filters() function

The filters() function parses a maths-like syntax into a filter list that can be used with Dataset_query().
For now it only works with the original column names (the field names), not the field titles. The available filter operators are documented here: solvebio API

# Attach the package; `quietly` is spelled out in full because the shorter
# `quiet` only worked through partial argument matching.
library(quartzbio.edp, quietly = TRUE)
# Set the EDP_PROFILE environment variable
# (presumably selects the connection profile -- TODO confirm).
Sys.setenv(EDP_PROFILE = "vsim-dev_rw")
# Reference the example dataset by its full path.
ds <- Dataset(
  full_path = "quartzbio:Public:/TCGA/2.0.0-2018-mc3-v0.2.8/PatientsDiseases"
)
#> Loading required package: RcppSimdJson
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v1/user...
#> Connected to https://vsim-dev.api.edp.aws.quartz.bio with user "Karl Forner" using an API Token
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
# Query the dataset without any filter, capped by limit = 2000.
df1 <- Dataset_query(ds, limit = 2000)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959470796697098463/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959470796697098463/fields...
#> took 0.169 to process the meta data.
# Count the returned rows per DISEASE value.
dplyr::count(df1, DISEASE)
#>   DISEASE   n
#> 1     ACC  76
#> 2    BLCA 399
#> 3    BRCA 981
#> 4    CESC 272
#> 5    CHOL  36
#> 6    COAD 236

Simple filter


# Build a filter list from the maths-like syntax; the field is referenced by
# its original lowercase name `disease`.
fi <- filters('disease = "BLCA"')
# Inspect the parsed filter: a list holding one (field, value) pair.
dplyr::glimpse(fi)
#> List of 1
#>  $ :List of 2
#>   ..$ : chr "disease"
#>   ..$ : chr "BLCA"
# Run the query again, this time with the filter applied.
df1 <- Dataset_query(ds,  filters = fi, limit = 2000)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959470796697098463/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959470796697098463/fields...
#> took 0.162 to process the meta data.
# Count rows per DISEASE value to check that only BLCA remains.
dplyr::count(df1, DISEASE)
#>   DISEASE   n
#> 1    BLCA 399

# How to obtain the original column names matching the Fields:
# list the dataset fields as a data frame ...
fields <- as.data.frame(DatasetFields(ds))
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959470796697098463/fields...
# ... and compare each displayed `title` with the `name` to use in filters().
fields[, c('title', 'name')]
#>                           title            name
#> patient_barcode PATIENT_BARCODE patient_barcode
#> sample_barcode   SAMPLE_BARCODE  sample_barcode
#> disease                 DISEASE         disease
#> subtype                 SUBTYPE         subtype
#> _commit                 _commit         _commit

# This filter does not work: it uses the capitalized field title DISEASE
# instead of the original field name `disease`.
fi <- filters('DISEASE = "BLCA"')
# try() lets the script continue so that the API error message is shown.
try(df1 <- Dataset_query(ds,  filters = fi, limit = 2000))
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959470796697098463/data...
#> Error : API error: Invalid field "DISEASE" used in filter. Did you mean "disease"?

Combined Filters

# Combine two parenthesized conditions with AND: an equality test on `disease`
# and a regular-expression match on `sample_barcode`.
fi <- filters('(disease = "BLCA") AND (sample_barcode regexp ".*-4.*")')
df <- Dataset_query(ds, filters = fi)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959470796697098463/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959470796697098463/fields...
#> took 0.165 to process the meta data.
# Show 3 randomly sampled rows of the result.
dplyr::slice_sample(df, n = 3)
#>   PATIENT_BARCODE  SAMPLE_BARCODE DISEASE        SUBTYPE
#> 1    TCGA-4Z-AA7O TCGA-4Z-AA7O-01    BLCA Not_Applicable
#> 2    TCGA-4Z-AA83 TCGA-4Z-AA83-01    BLCA Not_Applicable
#> 3    TCGA-4Z-AA80 TCGA-4Z-AA80-01    BLCA Not_Applicable

Use JSON filters

Filters can also be passed to Dataset_query() in JSON format (as a string).
This is convenient, for example, for reusing filters generated in the EDP web UI.


# Start from the same combined filter as before, built as a filter list.
fi <- filters('(disease = "BLCA") AND (sample_barcode regexp ".*-4.*")')

# Serialize the filter list to its JSON representation and print it.
fi_json <- jsonlite::toJSON(
  fi,
  pretty = TRUE,
  auto_unbox = TRUE
)
fi_json
#> [
#>   {
#>     "and": [
#>       [
#>         "disease",
#>         "BLCA"
#>       ],
#>       [
#>         "sample_barcode__regexp",
#>         ".*-4.*"
#>       ]
#>     ]
#>   }
#> ]
# Run the query with the JSON filter instead of the filter list.
df_json <- Dataset_query(ds, filters = fi_json)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959470796697098463/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959470796697098463/fields...
#> took 0.263 to process the meta data.
# Show 3 randomly sampled rows of the result.
dplyr::slice_sample(df_json, n = 3)
#>   PATIENT_BARCODE  SAMPLE_BARCODE DISEASE        SUBTYPE
#> 1    TCGA-4Z-AA7Y TCGA-4Z-AA7Y-01    BLCA Not_Applicable
#> 2    TCGA-4Z-AA7O TCGA-4Z-AA7O-01    BLCA Not_Applicable
#> 3    TCGA-4Z-AA86 TCGA-4Z-AA86-01    BLCA Not_Applicable
# results are identical: compare the full JSON-filter result with the
# filter-list result obtained from the same filter expression
all.equal(df_json, df)
#> [1] TRUE