Example of using DuckDB to convert from JSON to Parquet

Load needed packages

library(httr)
library(jsonlite)
library(openalexR)

Thank you for using openalexR!
To acknowledge our work, please cite the package by calling `citation("openalexR")`.
To suppress this message, add `openalexR.message = suppressed` to your .Renviron file.

## For doing the conversion
library(DBI)

library(duckdb)

if (packageVersion("duckdb") < "1.0.99.9000") {
  warning("`duckdb` versions smaller than 1.0.99.9000 have a bug \nin the export to hive partitioned parquet files \nwhich can result in invalid parquet datasets!")
}

library(tibble)

## Just for timing and other useful stuff in the report
library(tictoc)
library(knitr)

list.files(
  path = "R",
  pattern = ".R$",
  full.names = TRUE,
  recursive = FALSE
) |>
  sapply(
    FUN = source
  )

        R/api_request.R R/json_to_parquet.R R/json_to_tibble.R R/oa_request.R
value   ?               ?                   ?                  ?             
visible FALSE           FALSE               FALSE              FALSE

unlink(
  "data",
  recursive = TRUE,
  force = TRUE
)
dir.create(
  "data"
)

# search_term <- "toast AND biodiversity" # about 800 records
# search_term <- '"deep sea" AND fishing AND illegal' # about 2500 records
search_term <- '"deep sea" AND fishing' # about 18600 records

Run `openalexR::oa_fetch() first time aet caches

tic()

x <- openalexR::oa_fetch(
  entity = "works",
  fulltext.search = search_term,
  output = "tibble"
)

Warning in oa_request(oa_query(filter = filter_i, multiple_id = multiple_id, : 
The following work(s) have truncated lists of authors: W2088891049, W2970419732, W4250796065.
Query each work separately by its identifier to get full list of authors.
For example:
  lapply(c("W2088891049", "W2970419732"), \(x) oa_fetch(identifier = x))
Details at https://docs.openalex.org/api-entities/authors/limitations.

toc()

117.372 sec elapsed

Run `openalexR::oa_fetch() for timing

tic()

x <- openalexR::oa_fetch(
  entity = "works",
  fulltext.search = search_term,
  output = "tibble"
)

Warning in oa_request(oa_query(filter = filter_i, multiple_id = multiple_id, : 
The following work(s) have truncated lists of authors: W2088891049, W2970419732, W4250796065.
Query each work separately by its identifier to get full list of authors.
For example:
  lapply(c("W2088891049", "W2970419732"), \(x) oa_fetch(identifier = x))
Details at https://docs.openalex.org/api-entities/authors/limitations.

toc()

111.31 sec elapsed

saveRDS(x, file = file.path("data", "data_from_fetch.rds"))
x

# A tibble: 16,950 × 39
   id     title display_name author ab    publication_date relevance_score so   
   <chr>  <chr> <chr>        <list> <chr> <chr>                      <dbl> <chr>
 1 https… Accu… Accumulatio… <df>   "One… 2009-07-27                 1113. Phil…
 2 https… The … The Biodive… <df>   "The… 2010-08-02                  785. PLoS…
 3 https… Acce… Acceleratin… <df>   "Coa… 2009-07-28                  748. Proc…
 4 https… Plas… Plastics, t… <df>   "Pla… 2009-07-27                  689. Phil…
 5 https… Micr… Microplasti… <df>   "Thi… 2012-03-02                  655. Envi…
 6 https… Fish… Fishes of t… <df>    <NA> 1987-08-01                  638. Fish…
 7 https… The … The Product… <df>   "Eco… 2007-07-27                  622. Scie…
 8 https… Glob… Global impr… <df>   "Res… 2013-08-04                  581. Natu…
 9 https… Exti… Extinction … <df>   "The… 2014-01-21                  565. eLife
10 https… Man … Man and the… <df>   "The… 2011-08-01                  562. PLoS…
# ℹ 16,940 more rows
# ℹ 31 more variables: so_id <chr>, host_organization <chr>, issn_l <chr>,
#   url <chr>, pdf_url <chr>, license <chr>, version <chr>, first_page <chr>,
#   last_page <chr>, volume <chr>, issue <chr>, is_oa <lgl>,
#   is_oa_anywhere <lgl>, oa_status <chr>, oa_url <chr>,
#   any_repository_has_fulltext <lgl>, language <chr>, grants <list>,
#   cited_by_count <int>, counts_by_year <list>, publication_year <int>, …

Get the records as JSON files as returned from OpenAlex

tic()

json_dir <- file.path("data", "json")
dir.create(
  json_dir,
  recursive = TRUE
)

openalexR::oa_query(
  entity = "works",
  fulltext.search = search_term
) |>
  oa_request(
    json_dir = json_dir
  )

Warning in oa_request(openalexR::oa_query(entity = "works", fulltext.search = search_term), : 
The following work(s) have truncated lists of authors: W2088891049, W2970419732, W4250796065.
Query each work separately by its identifier to get full list of authors.
For example:
  lapply(c("W2088891049", "W2970419732"), \(x) oa_fetch(identifier = x))
Details at https://docs.openalex.org/api-entities/authors/limitations.

[1] "data/json"

toc()

55.683 sec elapsed

Convert them to parquet format

tic()
json_to_parquet(
  json_dir = file.path("data", "json"),
  arrow_dir = file.path("data", "data")
)

[1] 16950

toc()

4.331 sec elapsed

Read as tibble

tic()
x <- json_to_tibble(
  json_dir = file.path("data", "json")
)
save(x, file = file.path("data", "data_from_json.rds"))
toc()

14.717 sec elapsed

# A tibble: 16,950 × 51
   id                  doi   title display_name relevance_score publication_year
   <chr>               <chr> <chr> <chr>                  <dbl>            <dbl>
 1 https://openalex.o… http… Accu… Accumulatio…           1113.             2009
 2 https://openalex.o… http… The … The Biodive…            785.             2010
 3 https://openalex.o… http… Acce… Acceleratin…            748.             2009
 4 https://openalex.o… http… Plas… Plastics, t…            689.             2009
 5 https://openalex.o… http… Micr… Microplasti…            655.             2012
 6 https://openalex.o… http… Fish… Fishes of t…            638.             1987
 7 https://openalex.o… http… The … The Product…            622.             2007
 8 https://openalex.o… http… Glob… Global impr…            581.             2013
 9 https://openalex.o… http… Exti… Extinction …            565.             2014
10 https://openalex.o… http… Man … Man and the…            562.             2011
# ℹ 16,940 more rows
# ℹ 45 more variables: publication_date <date>, ids <df[,5]>, language <chr>,
#   primary_location <df[,9]>, type <chr>, type_crossref <chr>,
#   indexed_in <list>, open_access <df[,4]>, authorships <list>,
#   countries_distinct_count <dbl>, institutions_distinct_count <dbl>,
#   corresponding_author_ids <list>, corresponding_institution_ids <list>,
#   apc_list <df[,4]>, apc_paid <df[,4]>, fwci <dbl>, has_fulltext <lgl>, …