Skip to contents

Read only part of data

# `quietly` is spelled out in full instead of relying on partial argument matching
library(quartzbio.edp, quietly = TRUE)
# NOTE(review): EDP_PROFILE presumably selects the connection profile used below -- confirm
Sys.setenv(EDP_PROFILE = "vsim-dev_rw")


# a small dataset: about 9,000 rows (see documents_count below)
ds <- Dataset(full_path = "quartzbio:Public:/TCGA/2.0.0-2018-mc3-v0.2.8/PatientsDiseases")
#> Loading required package: RcppSimdJson
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v1/user...
#> Connected to https://vsim-dev.api.edp.aws.quartz.bio with user "Karl Forner" using an API Token
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
ds$documents_count
#> [1] 9125

# fetch only the first 1000 rows, timing the request
system.time(df1 <- Dataset_query(ds, limit = 1000, meta = FALSE))
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959470796697098463/data...
#>    user  system elapsed 
#>   0.006   0.000   0.173
# show 4 randomly chosen rows of the fetched page
dplyr::slice_sample(df1, n = 4)
#>          subtype patient_barcode disease  sample_barcode
#> 1          Basal    TCGA-AN-A0XU    BRCA TCGA-AN-A0XU-01
#> 2           Her2    TCGA-A2-A0CX    BRCA TCGA-A2-A0CX-01
#> 3           LumA    TCGA-AC-A23E    BRCA TCGA-AC-A23E-01
#> 4 Not_Applicable    TCGA-OR-A5JB     ACC TCGA-OR-A5JB-01

Read all data

# Fetch the complete dataset, starting from the first page `df1`,
# and display the progress while doing so
library(progressr)

# No parallelisation: a single worker.
# with_progress() enables the progress bar only for the call it wraps;
# system.time() reports how long the whole fetch took.
with_progress(system.time(df <- fetch_all(df1, workers = 1, verbose = FALSE)))
#>    user  system elapsed 
#>   0.217   0.024   1.811

# Use several workers to parallelize the queries within the EDP host.
# `workers` is not given here, so the default (4 workers) applies.
with_progress(system.time(df <- fetch_all(df1, verbose = FALSE)))
#>    user  system elapsed 
#>   0.167   0.000   1.311

Estimate download time

dsbig <- Dataset(full_path = "quartzbio:Public:/TCGA/2.0.0-2018-mc3-v0.2.8/SomaticMutations-GRCh37")
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
# this dataset contains more than 3 million rows
dsbig$documents_count
#> [1] 3603497
# The default maximum page size in EDP is 10,000.
# Attributes attached to a query result help anticipate the time needed
# to obtain the complete dataset: fetch a single page first, then inspect them.
dfb <- Dataset_query(dsbig, limit = 1000, meta = FALSE)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959547506112563037/data...
attrs <- attributes(dfb)

# Total number of rows
attrs$total
#> [1] 3603497

# Pagination info: page index, number of pages, total number of rows and chosen page size.
# Here 3,603,497 rows at 1000 rows per page give 3604 pages; multiplying the number
# of pages by the time taken for one page gives a rough estimate of the full download time.
attrs$pagination$page_index
#> $index
#> [1] 1
#> 
#> $nb
#> [1] 3604
#> 
#> $total
#> [1] 3603497
#> 
#> $size
#> [1] 1000