Filters
Filters.Rmd
filters() function
The filters() function parses a maths-like syntax into a filter list
that can be passed to Dataset_query().
For now it only works with the original field names (e.g. disease) and
not with the field titles (e.g. DISEASE). The available filter operators
are documented in the SolveBio API documentation.
library(quartzbio.edp, quiet = TRUE)
Sys.setenv(EDP_PROFILE = "vsim-dev_rw")
ds <- Dataset(full_path = "quartzbio:Public:/TCGA/2.0.0-2018-mc3-v0.2.8/PatientsDiseases")
#> Loading required package: RcppSimdJson
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v1/user...
#> Connected to https://vsim-dev.api.edp.aws.quartz.bio with user "Karl Forner" using an API Token
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
df1 <- Dataset_query(ds, limit = 2000)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959470796697098463/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959470796697098463/fields...
#> took 0.169 to process the meta data.
dplyr::count(df1, DISEASE)
#> DISEASE n
#> 1 ACC 76
#> 2 BLCA 399
#> 3 BRCA 981
#> 4 CESC 272
#> 5 CHOL 36
#> 6 COAD 236
Simple filter
# build a filter list with the new syntax
fi <- filters('disease = "BLCA"')
dplyr::glimpse(fi)
#> List of 1
#> $ :List of 2
#> ..$ : chr "disease"
#> ..$ : chr "BLCA"
df1 <- Dataset_query(ds, filters = fi, limit = 2000)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959470796697098463/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959470796697098463/fields...
#> took 0.162 to process the meta data.
dplyr::count(df1, DISEASE)
#> DISEASE n
#> 1 BLCA 399
# how to obtain original column names matching the Fields
fields <- as.data.frame(DatasetFields(ds))
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959470796697098463/fields...
fields[, c('title', 'name')]
#> title name
#> patient_barcode PATIENT_BARCODE patient_barcode
#> sample_barcode SAMPLE_BARCODE sample_barcode
#> disease DISEASE disease
#> subtype SUBTYPE subtype
#> _commit _commit _commit
# this filter fails because it uses the field title (DISEASE) instead of the field name (disease)
fi <- filters('DISEASE = "BLCA"')
try(df1 <- Dataset_query(ds, filters = fi, limit = 2000))
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959470796697098463/data...
#> Error : API error: Invalid field "DISEASE" used in filter. Did you mean "disease"?
Combined Filters
fi <- filters('(disease = "BLCA") AND (sample_barcode regexp ".*-4.*")')
df <- Dataset_query(ds, filters = fi)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959470796697098463/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959470796697098463/fields...
#> took 0.165 to process the meta data.
dplyr::slice_sample(df, n = 3)
#> PATIENT_BARCODE SAMPLE_BARCODE DISEASE SUBTYPE
#> 1 TCGA-4Z-AA7O TCGA-4Z-AA7O-01 BLCA Not_Applicable
#> 2 TCGA-4Z-AA83 TCGA-4Z-AA83-01 BLCA Not_Applicable
#> 3 TCGA-4Z-AA80 TCGA-4Z-AA80-01 BLCA Not_Applicable
Use JSON filters
Filters can also be passed to Dataset_query() in JSON format (as a
string).
This is convenient, for example, for reusing filters generated in the
EDP web UI.
# the same filter but using the JSON format
fi <- filters('(disease = "BLCA") AND (sample_barcode regexp ".*-4.*")')
# generates the json version of the filter
fi_json <- jsonlite::toJSON(fi, pretty =TRUE, auto_unbox= TRUE)
fi_json
#> [
#> {
#> "and": [
#> [
#> "disease",
#> "BLCA"
#> ],
#> [
#> "sample_barcode__regexp",
#> ".*-4.*"
#> ]
#> ]
#> }
#> ]
df_json <- Dataset_query(ds, filters = fi_json)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959470796697098463/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959470796697098463/fields...
#> took 0.263 to process the meta data.
dplyr::slice_sample(df_json, n = 3)
#> PATIENT_BARCODE SAMPLE_BARCODE DISEASE SUBTYPE
#> 1 TCGA-4Z-AA7Y TCGA-4Z-AA7Y-01 BLCA Not_Applicable
#> 2 TCGA-4Z-AA7O TCGA-4Z-AA7O-01 BLCA Not_Applicable
#> 3 TCGA-4Z-AA86 TCGA-4Z-AA86-01 BLCA Not_Applicable
# results are identical
all.equal(df_json, df)
#> [1] TRUE