Parallelisation
Parallelisation.Rmd
Read only part of data
# Attach the EDP client. The argument is spelled out as `quietly` instead of
# relying on partial matching of `quiet`.
library(quartzbio.edp, quietly = TRUE)
# Set the EDP_PROFILE environment variable before the first EDP call.
Sys.setenv(EDP_PROFILE = "vsim-dev_rw")
# a small dataset: 9125 rows (see documents_count below)
ds <- Dataset(full_path = "quartzbio:Public:/TCGA/2.0.0-2018-mc3-v0.2.8/PatientsDiseases")
#> Loading required package: RcppSimdJson
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v1/user...
#> Connected to https://vsim-dev.api.edp.aws.quartz.bio with user "Karl Forner" using an API Token
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
ds$documents_count
#> [1] 9125
# fetch only the first 1000 rows, and time the query
system.time(df1 <- Dataset_query(ds, limit = 1000, meta = FALSE))
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959470796697098463/data...
#> user system elapsed
#> 0.006 0.000 0.173
# show 4 randomly sampled rows of the result
dplyr::slice_sample(df1, n = 4)
#> subtype patient_barcode disease sample_barcode
#> 1 Basal TCGA-AN-A0XU BRCA TCGA-AN-A0XU-01
#> 2 Her2 TCGA-A2-A0CX BRCA TCGA-A2-A0CX-01
#> 3 LumA TCGA-AC-A23E BRCA TCGA-AC-A23E-01
#> 4 Not_Applicable TCGA-OR-A5JB ACC TCGA-OR-A5JB-01
Read all data
# Fetch all rows, starting from the partial result df1, and show progress.
library(progressr)
# Without parallelisation (a single worker).
# with_progress() enables a progress bar only for the call it wraps.
with_progress(system.time(df <- fetch_all(df1, workers = 1, verbose = FALSE)))
#> user system elapsed
#> 0.217 0.024 1.811
# Using several workers to parallelise the queries within the EDP host.
# The default is to use 4 workers.
with_progress(system.time(df <- fetch_all(df1, verbose = FALSE)))
#> user system elapsed
#> 0.167 0.000 1.311
Estimate download time
dsbig <- Dataset(full_path = "quartzbio:Public:/TCGA/2.0.0-2018-mc3-v0.2.8/SomaticMutations-GRCh37")
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
# This dataset contains more than 3 million rows.
dsbig$documents_count
#> [1] 3603497
# The default maximum page size in EDP is 10,000.
# Some attributes of a query result help anticipate the time needed to
# download the complete dataset.
dfb <- Dataset_query(dsbig, limit = 1000, meta = FALSE)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959547506112563037/data...
attrs <- attributes(dfb)
# Total number of rows
attrs$total
#> [1] 3603497
# Pagination info: page index, number of pages, total number of rows and
# chosen page size. Here ceiling(3603497 / 1000) = 3604 pages.
attrs$pagination$page_index
#> $index
#> [1] 1
#>
#> $nb
#> [1] 3604
#>
#> $total
#> [1] 3603497
#>
#> $size
#> [1] 1000