Skip to contents

TL;DR

library(quartzbio.edp, quiet = TRUE)
Sys.getenv('EDP_PROFILE')
#> [1] ""
Sys.setenv(EDP_PROFILE = 'vsim-dev_rw')
Sys.getenv('EDP_PROFILE')
#> [1] "vsim-dev_rw"

vlts <- Vaults()
#> Loading required package: RcppSimdJson
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v1/user...
#> Connected to https://vsim-dev.api.edp.aws.quartz.bio with user "Karl Forner" using an API Token
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults...
u <- User()
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v1/user...
u$first_name
#> [1] "Karl"
vme <- Vault_create(paste0('vault_', u$first_name))
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults...

Vault(id = vme)
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults/2666...
#> Vault "vsim-dev:vault_Karl" ("", general, RWA), @ "Karl Forner", updated at:2023-06-06T09:54:59.653Z
Folder_create(vme, 'data/cyto')
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> folder "vsim-dev:vault_Karl:/data/cyto" (application/vnd.solvebio.folder) nb:0 user:Karl Forner accessed:2023-06-06T09:55:00.418Z
Folder_create(vme, 'data/flow')
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> folder "vsim-dev:vault_Karl:/data/flow" (application/vnd.solvebio.folder) nb:0 user:Karl Forner accessed:2023-06-06T09:55:00.751Z
Folder_create(vme, 'source/code')
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> folder "vsim-dev:vault_Karl:/source/code" (application/vnd.solvebio.folder) nb:0 user:Karl Forner accessed:2023-06-06T09:55:01.261Z
Folders(vault_id=vme)
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> EDP List of 5 Objects
#>           path object_type vault_name   user_name            last_modified
#> 1        /data      folder vault_Karl Karl Forner 2023-06-06T09:55:00.189Z
#> 2   /data/cyto      folder vault_Karl Karl Forner 2023-06-06T09:55:00.418Z
#> 3   /data/flow      folder vault_Karl Karl Forner 2023-06-06T09:55:00.751Z
#> 4      /source      folder vault_Karl Karl Forner 2023-06-06T09:55:01.079Z
#> 5 /source/code      folder vault_Karl Karl Forner 2023-06-06T09:55:01.261Z

delete(vme)
#> DELETE https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults/2666...
#> Vault "vsim-dev:vault_Karl" ("", general, rwa), @ "Karl Forner", updated at:2023-06-06T09:55:01.660Z

User

# select an EDP instance using a profile

u <- User()
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v1/user...
u$first_name
#> [1] "Karl"
u$last_name
#> [1] "Forner"
u$full_name
#> [1] "Karl Forner"
u$id
#> [1] 51
u$email
#> [1] "karl.forner@precisionformedicine.com"
u$account$name
#> [1] "vsim-dev"
u$url
#> [1] "https://vsim-dev.api.edp.aws.quartz.bio/v1/users/51"

Vaults

# select an EDP instance using a profile
Sys.setenv(EDP_PROFILE = 'vsim-dev_rw')
# fetch personal vault info
myV <- Vault()
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v1/user...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults...
# personal vault: user-id
myV$name
#> [1] "user-51"
myV$full_path 
#> [1] "vsim-dev:user-51"
myV$has_children  
#> [1] TRUE
myV$has_folder_children
#> [1] TRUE
myV$user$full_name
#> [1] "Karl Forner"
myV$permissions
#> $read
#> [1] TRUE
#> 
#> $write
#> [1] TRUE
#> 
#> $admin
#> [1] FALSE

Vault creation

# create a vault
v <- Vault_create('vault_test_1', description = 'test_1', tags = list('tag1', 'tag2'))
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults...
v$name
#> [1] "vault_test_1"
v$full_path
#> [1] "vsim-dev:vault_test_1"

# retrieve a vault by name
Vault(name = 'vault_test_1')
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults...
#> Vault "vsim-dev:vault_test_1" ("test_1", general, RWA), @ "Karl Forner", updated at:2023-06-06T09:55:02.501Z

# retrieve a vault by full_path
Vault(full_path = v$full_path)
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults...
#> Vault "vsim-dev:vault_test_1" ("test_1", general, RWA), @ "Karl Forner", updated at:2023-06-06T09:55:02.501Z

v$description
#> [1] "test_1"
v$metadata
#> NULL
v$tags
#> [[1]]
#> [1] "tag1"
#> 
#> [[2]]
#> [1] "tag2"

# update metadata
new_name <- 'test_one'
v2 <- Vault_update(v, 
      name = new_name, 
      description = 'barabor', 
      metadata = list(meta_1 = 'foo'), 
      storage_class = 'Performance', tags = 'tag_A')
#> PUT https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults/2667...

v3 <- update(v2, storage_class = 'Temporary')
#> PUT https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults/2667...

Vaults listing, limit and paging

  • Vaults are ordered per locale “LC_TIME”, “us”
  • Intermediate pages can be sized using limit
  • fetch_next and fetch_all use that limit
# get the first two ordered vaults
vs1 <- Vaults(limit = 2, page = 1)
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults...

# get the third and fourth vaults
Vaults(limit = 2, page = 2)
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults...
#> List of 2 Vaults
#>     id            name                full_path      user_name vault_type               created_at
#> 1   19          Public         quartzbio:Public Admin Dev user    general 2022-11-02T23:38:43.735Z
#> 2 1300 Public Data Dev vsim-dev:Public Data Dev   David Caplan    general 2023-02-15T20:07:31.859Z

# same as above
vs2 <- fetch_next(vs1)
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults...
vs2
#> List of 2 Vaults
#>     id            name                full_path      user_name vault_type               created_at
#> 1   19          Public         quartzbio:Public Admin Dev user    general 2022-11-02T23:38:43.735Z
#> 2 1300 Public Data Dev vsim-dev:Public Data Dev   David Caplan    general 2023-02-15T20:07:31.859Z

# fetch all remaining vaults by pages of size 2
all_vlts <- fetch_all(vs1)
all_vlts_df <- as.data.frame(all_vlts)

# delete vault
delete(v)
#> DELETE https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults/2667...
#> Vault "vsim-dev:test_one" ("barabor", general, rwa), @ "Karl Forner", updated at:2023-06-06T09:55:04.616Z

Folders

# list all folders in an account, recursive by default
all_folders <- Folders()
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...

# control depth 
suppressMessages(Folders(depth=0))
#> EDP List of 18 Objects
#>                    path object_type      vault_name           user_name            last_modified
#> 1              /ClinGen      folder Public Data Dev        David Caplan 2023-02-15T20:07:39.539Z
#> 2              /ClinVar      folder          Public        David Caplan 2022-11-10T19:46:56.464Z
#> 3  /Dandan Testing CTMS      folder             ICF           Dandan Xu 2023-02-03T01:11:53.077Z
#> 4                 /dir1      folder         user-51         Karl Forner 2023-03-02T14:02:17.545Z
#> 5              /GENCODE      folder          Public        David Caplan 2022-11-13T01:24:41.193Z
#> 6                 /HGNC      folder          Public        David Caplan 2022-11-02T23:42:11.247Z
#> 7              /MEDLINE      folder          Public        David Caplan 2022-11-03T00:22:13.193Z
#> 8            /recursive      folder         user-51         Karl Forner 2023-03-02T14:56:40.353Z
#> 9           /r_examples      folder         user-51         Karl Forner 2023-03-17T10:31:37.489Z
#> 10 /Study ICF Templates      folder             ICF           Dandan Xu 2023-02-03T00:52:20.747Z
#> 11                /TCGA      folder          Public Tatjana Damnjanovic 2023-01-18T12:07:10.392Z
#> 12                  /v1      folder         user-51         Karl Forner 2023-05-11T08:44:15.267Z

suppressMessages(Folders(depth=1))
#> EDP List of 11 Objects
#>                                       path object_type vault_name           user_name            last_modified
#> 1                  /ClinVar/5.2.0-20210110      folder     Public        David Caplan 2022-11-10T19:47:11.142Z
#> 2                  /ClinVar/5.2.0-20221105      folder     Public        David Caplan 2022-11-10T20:32:55.867Z
#> 3                               /dir1/dir2      folder    user-51         Karl Forner 2023-03-02T14:02:17.736Z
#> 4                        /GENCODE/3.0.0-42      folder     Public        David Caplan 2022-11-13T01:24:41.387Z
#> 5                   /HGNC/3.3.1-2021-08-25      folder     Public        David Caplan 2022-11-02T23:45:22.368Z
#> 6                      /MEDLINE/2.3.4-2018      folder     Public        David Caplan 2022-11-03T00:22:26.392Z
#> 7                        /recursive/folder      folder    user-51         Karl Forner 2023-03-02T14:56:40.536Z
#> 8              /TCGA/2.0.0-2018-mc3-v0.2.8      folder     Public Tatjana Damnjanovic 2023-01-18T14:41:02.586Z


# get first four
Folders(limit = 4)
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> EDP List of 4 Objects
#>                      path object_type      vault_name    user_name            last_modified
#> 1                /ClinGen      folder Public Data Dev David Caplan 2023-02-15T20:07:39.539Z
#> 2                /ClinVar      folder          Public David Caplan 2022-11-10T19:46:56.464Z
#> 3 /ClinVar/5.2.0-20210110      folder          Public David Caplan 2022-11-10T19:47:11.142Z
#> 4 /ClinVar/5.2.0-20221105      folder          Public David Caplan 2022-11-10T20:32:55.867Z

# create a vault with description and tags, then a folder inside it
v1 <- Vault_create('_an_upload',  description = 'upload', tags = list('fake', 'can_be_removed'))
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults...
fdata <- Folder_create(v1, 'data')
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
fdata
#> folder "vsim-dev:_an_upload:/data" (application/vnd.solvebio.folder) nb:0 user:Karl Forner accessed:2023-06-06T09:55:06.147Z

# CAUTION: no overwriting by default, folders are renamed incrementally
# a new folder data-1 is created
Folder_create(v1, 'data')
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> folder "vsim-dev:_an_upload:/data-1" (application/vnd.solvebio.folder) nb:0 user:Karl Forner accessed:2023-06-06T09:55:06.339Z

# create a hierarchy
Folder_create(v1, 'source/code')
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> folder "vsim-dev:_an_upload:/source/code" (application/vnd.solvebio.folder) nb:0 user:Karl Forner accessed:2023-06-06T09:55:06.838Z

# List folders of a given vault - recursive
Folders(vault_id = v1)
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> EDP List of 4 Objects
#>           path object_type vault_name   user_name            last_modified
#> 1        /data      folder _an_upload Karl Forner 2023-06-06T09:55:06.147Z
#> 2      /data-1      folder _an_upload Karl Forner 2023-06-06T09:55:06.339Z
#> 3      /source      folder _an_upload Karl Forner 2023-06-06T09:55:06.656Z
#> 4 /source/code      folder _an_upload Karl Forner 2023-06-06T09:55:06.838Z

# List folders using regex on paths - recursive
Folders(regex = '^/data')
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> EDP List of 2 Objects
#>      path object_type vault_name   user_name            last_modified
#> 1   /data      folder _an_upload Karl Forner 2023-06-06T09:55:06.147Z
#> 2 /data-1      folder _an_upload Karl Forner 2023-06-06T09:55:06.339Z
Folders(regex = 'code$')
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> EDP List of 2 Objects
#>           path object_type vault_name    user_name            last_modified
#> 1     /GENCODE      folder     Public David Caplan 2022-11-13T01:24:41.193Z
#> 2 /source/code      folder _an_upload  Karl Forner 2023-06-06T09:55:06.838Z

# List folders matching paths - recursive
Folders(query = 'code')
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> EDP List of 1 Objects
#>           path object_type vault_name   user_name            last_modified
#> 1 /source/code      folder _an_upload Karl Forner 2023-06-06T09:55:06.838Z

# fetch a folder from a given vault
Folder(vault_id = v1, path = 'data')
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> folder "vsim-dev:_an_upload:/data" (application/vnd.solvebio.folder) nb:0 user:Karl Forner accessed:2023-06-06T09:55:06.147Z

# fetch a folder by its full_path
Folder(full_path = fdata$full_path)
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> folder "vsim-dev:_an_upload:/data" (application/vnd.solvebio.folder) nb:0 user:Karl Forner accessed:2023-06-06T09:55:06.147Z
delete(v1)
#> DELETE https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults/2668...
#> Vault "vsim-dev:_an_upload" ("upload", general, rwa), @ "Karl Forner", updated at:2023-06-06T09:55:08.315Z

Files

Upload and Download

upload CSV

irisp <- file.path(tempdir(), 'iris.csv')
write.csv(iris[1:10,], irisp)

v <- Vault_create('_iris_upload',  description = 'upload', 
  tags = list('iris', 'can_be_removed'))
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults...

vpath <- 'data/iris/v1/iris_10.csv'

# File upload: folder hierarchy is created on the fly
firis <- File_upload(v, irisp, vpath )
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> uploading file /tmp/RtmpeUkAj5/iris.csv...
firis$full_path
#> [1] "vsim-dev:_iris_upload:/data/iris/v1/iris_10.csv"
firis$path
#> [1] "/data/iris/v1/iris_10.csv"
firis$md5
#> [1] "14f700477f0105443bd294683f270a58"

# File download
res_down <- File_download(firis, file.path(dirname(irisp), 'iris_10_download.csv'))
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects/2060147415064509835/download...
dir(dirname(irisp), 'iris')
#> [1] "iris.csv"             "iris_10_download.csv"

# fetch a File object
File( full_path = firis$full_path)
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> file "vsim-dev:_iris_upload:/data/iris/v1/iris_10.csv" (text/csv) nb:0 user:Karl Forner accessed:2023-06-06T09:55:11.032Z
firis
#> file "vsim-dev:_iris_upload:/data/iris/v1/iris_10.csv" (text/csv) nb:0 user:Karl Forner accessed:2023-06-06T09:55:09.713Z

upload JSONL

The JSON Lines text format (JSONL), also called newline-delimited JSON (ndjson), is a convenient format for storing structured data that may be processed one record at a time.


  # upload jsonl file
irisj <- file.path(tempdir(), "iris.json")

# the stream_out function writes the JSONL format to a connection
jsonlite:::stream_out(iris[1:15, ], file(irisj))
#> opening file output connection.
#> 
Complete! Processed total of 15 rows.
#> closing file output connection.

vpath <- "v1/iris_10.json"
fi2 <- File_upload(v, irisj, vpath)
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> uploading file /tmp/RtmpeUkAj5/iris.json...
fi2$path
#> [1] "/v1/iris_10.json"
fi2$mimetype
#> [1] "application/json"

File_query(fi2)
#> fetching the number of rows of the file...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects/2060147433357145394...
#> found  lines
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects/2060147433357145394/data...
#>    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1           5.1         3.5          1.4         0.2  setosa
#> 2           4.9         3.0          1.4         0.2  setosa
#> 3           4.7         3.2          1.3         0.2  setosa
#> 4           4.6         3.1          1.5         0.2  setosa
#> 5           5.0         3.6          1.4         0.2  setosa
#> 6           5.4         3.9          1.7         0.4  setosa
#> 7           4.6         3.4          1.4         0.3  setosa
#> 8           5.0         3.4          1.5         0.2  setosa
#> 9           4.4         2.9          1.4         0.2  setosa
#> 10          4.9         3.1          1.5         0.1  setosa
#> 11          5.4         3.7          1.5         0.2  setosa
#> 12          4.8         3.4          1.6         0.2  setosa
#> 13          4.8         3.0          1.4         0.1  setosa
#> 14          4.3         3.0          1.1         0.1  setosa
#> 15          5.8         4.0          1.2         0.2  setosa
File_query(fi2, filters = filters("Sepal.Length >= 5"))
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects/2060147433357145394/data...
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1          5.1         3.5          1.4         0.2  setosa
#> 2          5.0         3.6          1.4         0.2  setosa
#> 3          5.4         3.9          1.7         0.4  setosa
#> 4          5.0         3.4          1.5         0.2  setosa
#> 5          5.4         3.7          1.5         0.2  setosa
#> 6          5.8         4.0          1.2         0.2  setosa

Files Listing

Listing files works like listing vaults and folders: it uses the same filtering criteria and the same fetch_next and fetch_all functions.

all_files <- as.data.frame(Files())
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...

Files(limit = 3)
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> EDP List of 3 Objects
#>                                                                                                        path object_type      vault_name    user_name            last_modified
#> 1                        /ClinGen/ClinGen-1-0-0-2015-06-02-Genes-1098056416333778650-20230215200640.json.gz        file Public Data Dev David Caplan 2023-02-15T20:07:46.945Z
#> 2 /ClinVar/5.2.0-20210110/ClinVar-5-2-0-20210110-Variants-GRCH37-1425664822266145048-20221110194518.json.gz        file          Public David Caplan 2022-11-10T19:47:23.197Z
#> 3 /ClinVar/5.2.0-20210110/ClinVar-5-2-0-20210110-Variants-GRCH38-1425664716292714398-20221110194523.json.gz        file          Public David Caplan 2022-11-10T19:47:23.188Z

Files(regex = 'TCG*')
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> EDP List of 2 Objects
#>                                                                              path object_type vault_name           user_name            last_modified
#> 1 /TCGA/2.0.0-2018-mc3-v0.2.8/TCGA-mc3.v0.2.8.PUBLIC.LAML_PATCH.maf.oncokb.txt.gz        file     Public        David Caplan 2023-03-13T16:22:09.865Z
#> 2                          /TCGA/2.0.0-2018-mc3-v0.2.8/TCGA_Patients_Diseases.txt        file     Public Tatjana Damnjanovic 2023-01-18T12:07:22.391Z

Files(regex = 'TCG*')
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> EDP List of 2 Objects
#>                                                                              path object_type vault_name           user_name            last_modified
#> 1 /TCGA/2.0.0-2018-mc3-v0.2.8/TCGA-mc3.v0.2.8.PUBLIC.LAML_PATCH.maf.oncokb.txt.gz        file     Public        David Caplan 2023-03-13T16:22:09.865Z
#> 2                          /TCGA/2.0.0-2018-mc3-v0.2.8/TCGA_Patients_Diseases.txt        file     Public Tatjana Damnjanovic 2023-01-18T12:07:22.391Z

# files from a given vault
Files(vault_id = v)
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> EDP List of 2 Objects
#>                        path object_type   vault_name   user_name            last_modified
#> 1 /data/iris/v1/iris_10.csv        file _iris_upload Karl Forner 2023-06-06T09:55:09.713Z
#> 2          /v1/iris_10.json        file _iris_upload Karl Forner 2023-06-06T09:55:11.894Z

# files from a given folder
dirs_df <- suppressMessages(as.data.frame(Folders()))
dplyr::filter(dirs_df,  path == '/GENCODE/3.0.0-42')
#>   account_id ancestor_object_ids availability class_name               created_at dataset_description dataset_documents_count documents_count dataset_full_name storage_class dataset_id depth
#> 1          2 1911311459582295135         NULL     Object 2022-11-13T01:24:41.387Z                NULL                    NULL            NULL              NULL          NULL       NULL     1
#>   description filename                          full_path has_children has_folder_children                  id is_deleted is_transformable            last_accessed            last_modified  md5
#> 1        NULL 3.0.0-42 quartzbio:Public:/GENCODE/3.0.0-42         TRUE               FALSE 1911311461138976190      FALSE            FALSE 2022-11-13T01:24:41.387Z 2022-11-13T01:24:41.387Z NULL
#>   metadata                        mimetype num_children num_descendants object_type    parent_object_id              path size tags               updated_at upload_url  url
#> 1     NULL application/vnd.solvebio.folder            4               4      folder 1911311459582295135 /GENCODE/3.0.0-42 NULL NULL 2023-05-23T12:41:09.998Z       NULL NULL
#>                                                                                               user user_id vault_id vault_name version_count global_beacon
#> 1 User, david.caplan@precisionformedicine.com, David, David Caplan, 2, TRUE, TRUE, primary_owner,        2       19     Public             0          NULL
fo <- Folder(full_path = 'quartzbio:Public:/GENCODE/3.0.0-42')
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
fo
#> folder "quartzbio:Public:/GENCODE/3.0.0-42" (application/vnd.solvebio.folder) nb:0 user:David Caplan accessed:2022-11-13T01:24:41.387Z
Files(ancestor_id = fo$id)
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> EDP List of 2 Objects
#>                                                    path object_type vault_name    user_name            last_modified
#> 1       /GENCODE/3.0.0-42/gencode.v42.annotation.gtf.gz        file     Public David Caplan 2022-11-13T02:21:57.723Z
#> 2 /GENCODE/3.0.0-42/gencode.v42lift37.annotation.gtf.gz        file     Public David Caplan 2022-11-13T02:23:12.714Z

Files Move

# move file from v1/ to v2/
f1 <- Folder_create(vault_id=v, '/data/iris/v2')
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
firis_v2 <- Object_update(firis, parent_object_id = f1$id)
#> PUT https://vsim-dev.api.edp.aws.quartz.bio/v2/objects/2060147415064509835...
firis_v2
#> file "vsim-dev:_iris_upload:/data/iris/v2/iris_10.csv" (text/csv) nb:10 user:Karl Forner accessed:2023-06-06T09:55:11.032Z

Files Querying

The columns and rows of tabular objects can be queried.

# fetch the first two rows
f2r <- File_query(firis_v2, limit=2)
#> fetching the number of rows of the file...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects/2060147415064509835...
#> found 10 lines
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects/2060147415064509835/data...

# fetch the next two rows
fetch_next(f2r)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects/2060147415064509835/data...
#>     Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 3          4.7         3.2          1.3         0.2  setosa
#> 2 4          4.6         3.1          1.5         0.2  setosa

# fetch row 8
File_query(firis_v2, limit=1,  offset=7)
#> fetching the number of rows of the file...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects/2060147415064509835...
#> found 10 lines
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects/2060147415064509835/data...
#>     Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 8            5         3.4          1.5         0.2  setosa
iris[8,]
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 8            5         3.4          1.5         0.2  setosa

# fetch all Setosa 
File_query(firis_v2, filters= filters('Species contains "setosa"'))
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects/2060147415064509835/data...
#>       Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1   1          5.1         3.5          1.4         0.2  setosa
#> 2   2          4.9           3          1.4         0.2  setosa
#> 3   3          4.7         3.2          1.3         0.2  setosa
#> 4   4          4.6         3.1          1.5         0.2  setosa
#> 5   5            5         3.6          1.4         0.2  setosa
#> 6   6          5.4         3.9          1.7         0.4  setosa
#> 7   7          4.6         3.4          1.4         0.3  setosa
#> 8   8            5         3.4          1.5         0.2  setosa
#> 9   9          4.4         2.9          1.4         0.2  setosa
#> 10 10          4.9         3.1          1.5         0.1  setosa
File_query(firis_v2, filters= filters('Sepal.Length = 4.7'))
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects/2060147415064509835/data...
#> data frame with 0 columns and 0 rows

Datasets

Create and Import from a data.frame

iris_2 <- Dataset_create(v, 'iris_2.ds')
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
iris_2$full_path
#> [1] "vsim-dev:_iris_upload:/iris_2.ds"
iris_2$object_type
#> [1] "dataset"

import_res <- Dataset_import(iris_2, df = iris[1:12, ], sync = TRUE)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/dataset_imports...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060147492615808236...
#> waiting for task 2060147492615808236 ("Dataset Import" for "vsim-dev:_iris_upload:/iris_2.ds"),  29 retries left
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060147492615808236...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060147504343542628...
#> waiting for task 2060147504343542628 ("Dataset Commit" for "vsim-dev:_iris_upload:/iris_2.ds"),  28 retries left
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060147504343542628...
#> waiting for task 2060147504343542628 ("Dataset Commit" for "vsim-dev:_iris_upload:/iris_2.ds"),  27 retries left
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060147504343542628...
#> waiting for task 2060147504343542628 ("Dataset Commit" for "vsim-dev:_iris_upload:/iris_2.ds"),  26 retries left
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060147504343542628...
#> waiting for task 2060147504343542628 ("Dataset Commit" for "vsim-dev:_iris_upload:/iris_2.ds"),  25 retries left
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060147504343542628...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks...
iris_2 <- Dataset(vault_id = v, path = iris_2$path)
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
i2 <- Dataset_query(iris_2, limit = 1)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147488315883162/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147488315883162/fields...
#> took 0.161000000000001 to process the meta data.
fetch_next(i2)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147488315883162/data...
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1          4.9           3          1.4         0.2  setosa

Create and Import from an EDP File object

  
  # prepare the file to upload
  MTCARS <- mtcars[1:4, 1:5 ]

  # write it in a local file
  local_path <- file.path(tempdir(), "mtcars.csv")
  write.csv(MTCARS, local_path, row.names = FALSE)

  # upload it
  fi <- suppressMessages(File_upload(v, local_path, "/a/b/c/mtcars.csv"))
  
  # create the dataset (empty)
  ds <- Dataset_create(v, "mtcars.ds")
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
  # import the EDP File to the Dataset
  suppressMessages(imp <- Dataset_import(ds, file_id = fi, sync =TRUE))
#> Warning in Dataset_import(ds, file_id = fi, sync = TRUE): got timeout while waiting for dataset import task completion: 2060147658965121373

  # with meta set to FALSE some reordering may appear in columns
  df <- Dataset_query(ds, meta = FALSE, limit = 5000)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147655134713188/data...
  df[1:3, ]
#>   disp  mpg  hp drat cyl
#> 1  160 21.0 110 3.90   6
#> 2  160 21.0 110 3.90   6
#> 3  108 22.8  93 3.85   4

  df_with_meta <-  Dataset_query(ds, meta = TRUE)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147655134713188/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147655134713188/fields...
#> took 0.167000000000002 to process the meta data.
  df_with_meta[1:3, ]
#>    mpg cyl disp  hp drat
#> 1 21.0   6  160 110 3.90
#> 2 21.0   6  160 110 3.90
#> 3 22.8   4  108  93 3.85

Create and Import from a list of records

genes_1 <- Dataset_create(v, 'genes.ds', 
  description = "genes hits",
  metadata = list(DEV = TRUE), 
  tags = list("QBP", "EDP"), 
  storage_class = "Temporary", capacity = "small")
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...

records <- list(
  list(gene = "CFTR", importance = 1, sample_count = 2104),
  list(gene = "BRCA2", importance = 1, sample_count = 1391),
  list(gene = "CLIC2", importance = 5, sample_count = 14)
)

import_res <- Dataset_import(genes_1, records = records, sync = TRUE)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/dataset_imports...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060148482955668768...
#> waiting for task 2060148482955668768 ("Dataset Import" for "vsim-dev:_iris_upload:/genes.ds"),  29 retries left
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060148482955668768...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060148492685464999...
#> waiting for task 2060148492685464999 ("Dataset Commit" for "vsim-dev:_iris_upload:/genes.ds"),  28 retries left
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060148492685464999...
#> waiting for task 2060148492685464999 ("Dataset Commit" for "vsim-dev:_iris_upload:/genes.ds"),  27 retries left
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060148492685464999...
#> waiting for task 2060148492685464999 ("Dataset Commit" for "vsim-dev:_iris_upload:/genes.ds"),  26 retries left
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060148492685464999...
#> waiting for task 2060148492685464999 ("Dataset Commit" for "vsim-dev:_iris_upload:/genes.ds"),  25 retries left
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060148492685464999...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks...
g1 <- Dataset_query(genes_1, limit = 1)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060148478010548922/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060148478010548922/fields...
#> took 0.162000000000006 to process the meta data.
g1
#>   importance gene sample_count
#> 1          1 CFTR         2104
fetch_all(g1)
#>   importance  gene sample_count
#> 1          1  CFTR         2104
#> 2          1 BRCA2         1391
#> 3          5 CLIC2           14
Dataset_query(genes_1, filters = filters('sample_count <= 14'))
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060148478010548922/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060148478010548922/fields...
#> took 0.157999999999987 to process the meta data.
#>   importance  gene sample_count
#> 1          5 CLIC2           14

Modify records with Python expressions

nobs <- Dataset_create(v, 'dna_gurus.ds')
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
authors <- list(
    list(name='Francis Crick'),
    list(name='James Watson'),
    list(name='Rosalind Franklin')
)
# additional first and last name fields to be created
target_fields <- list(
  list(
    name="first_name",
    description="Adds a first name column based on name column",
    data_type="string",
    expression="record.name.split(' ')[0]"
  ),
  list(
    name="last_name",
    description="Adds a last name column based on name column",
    data_type="string",
    expression="record.name.split(' ')[-1]"
  )
)
res <- Dataset_import(nobs, records = authors, 
  target_fields = target_fields, 
  sync = TRUE)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/dataset_imports...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060148639020092336...
#> waiting for task 2060148639020092336 ("Dataset Import" for "vsim-dev:_iris_upload:/dna_gurus.ds"),  29 retries left
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060148639020092336...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060148649900332273...
#> waiting for task 2060148649900332273 ("Dataset Commit" for "vsim-dev:_iris_upload:/dna_gurus.ds"),  28 retries left
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060148649900332273...
#> waiting for task 2060148649900332273 ("Dataset Commit" for "vsim-dev:_iris_upload:/dna_gurus.ds"),  27 retries left
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060148649900332273...
#> waiting for task 2060148649900332273 ("Dataset Commit" for "vsim-dev:_iris_upload:/dna_gurus.ds"),  26 retries left
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060148649900332273...
#> waiting for task 2060148649900332273 ("Dataset Commit" for "vsim-dev:_iris_upload:/dna_gurus.ds"),  25 retries left
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060148649900332273...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks...
# The imported records now carry the computed first_name and last_name
# columns next to the original name column.
Dataset_query(nobs)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060148634457461929/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060148634457461929/fields...
#> took 0.248999999999995 to process the meta data.
#>   first_name last_name              name
#> 1    Francis     Crick     Francis Crick
#> 2      James    Watson      James Watson
#> 3   Rosalind  Franklin Rosalind Franklin

Dataset Query

# Fetch only the first row (limit = 1) and keep the result for paging.
f1r_ds <- Dataset_query(iris_2, limit=1)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147488315883162/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147488315883162/fields...
#> took 0.183999999999997 to process the meta data.

# fetch_next() issues a new request for the page after the one already held;
# with limit = 1 that is the second row of the dataset.
fetch_next(f1r_ds)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147488315883162/data...
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1          4.9           3          1.4         0.2  setosa

# fetch_all() returns all 12 rows, regardless of the limit of the initial query.
fetch_all(f1r_ds)
#>    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1           5.1         3.5          1.4         0.2  setosa
#> 2           4.9         3.0          1.4         0.2  setosa
#> 3           4.7         3.2          1.3         0.2  setosa
#> 4           4.6         3.1          1.5         0.2  setosa
#> 5           5.0         3.6          1.4         0.2  setosa
#> 6           5.4         3.9          1.7         0.4  setosa
#> 7           4.6         3.4          1.4         0.3  setosa
#> 8           5.0         3.4          1.5         0.2  setosa
#> 9           4.4         2.9          1.4         0.2  setosa
#> 10          4.9         3.1          1.5         0.1  setosa
#> 11          5.4         3.7          1.5         0.2  setosa
#> 12          4.8         3.4          1.6         0.2  setosa

# Filters act on the dataset's column fields, which match the columns of the
# imported data.frame.
# Text match on a character column.
Dataset_query(
  iris_2,
  filters = filters('Species contains "setosa"')
)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147488315883162/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147488315883162/fields...
#> took 0.169000000000011 to process the meta data.
#>    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1           5.1         3.5          1.4         0.2  setosa
#> 2           4.9         3.0          1.4         0.2  setosa
#> 3           4.7         3.2          1.3         0.2  setosa
#> 4           4.6         3.1          1.5         0.2  setosa
#> 5           5.0         3.6          1.4         0.2  setosa
#> 6           5.4         3.9          1.7         0.4  setosa
#> 7           4.6         3.4          1.4         0.3  setosa
#> 8           5.0         3.4          1.5         0.2  setosa
#> 9           4.4         2.9          1.4         0.2  setosa
#> 10          4.9         3.1          1.5         0.1  setosa
#> 11          5.4         3.7          1.5         0.2  setosa
#> 12          4.8         3.4          1.6         0.2  setosa

# Numeric comparison on a single column.
Dataset_query(
  iris_2,
  filters = filters("Sepal.Length >= 5.1")
)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147488315883162/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147488315883162/fields...
#> took 0.171000000000021 to process the meta data.
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1          5.1         3.5          1.4         0.2  setosa
#> 2          5.4         3.9          1.7         0.4  setosa
#> 3          5.4         3.7          1.5         0.2  setosa

# Two conditions combined with OR.
Dataset_query(
  iris_2,
  filters = filters("(Sepal.Length >= 5.1) OR  (Sepal.Width <= 3.0)")
)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147488315883162/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147488315883162/fields...
#> took 0.163000000000011 to process the meta data.
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1          5.1         3.5          1.4         0.2  setosa
#> 2          4.9         3.0          1.4         0.2  setosa
#> 3          5.4         3.9          1.7         0.4  setosa
#> 4          4.4         2.9          1.4         0.2  setosa
#> 5          5.4         3.7          1.5         0.2  setosa

# Keep fields: return only the listed columns for the matching rows.
Dataset_query(
  iris_2,
  fields = c("Petal.Width", "Species"),
  filters = filters("(Sepal.Length >= 5.1) OR  (Sepal.Width <= 3.0)")
)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147488315883162/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147488315883162/fields...
#> took 0.168999999999983 to process the meta data.
#>   Petal.Width Species
#> 1         0.2  setosa
#> 2         0.2  setosa
#> 3         0.4  setosa
#> 4         0.2  setosa
#> 5         0.2  setosa

# Exclude fields: drop the listed columns; the `_commit` and `_id` columns
# also show up in this result.
Dataset_query(
  iris_2,
  exclude_fields = c("Petal.Width", "Species"),
  filters = filters("(Sepal.Length >= 5.1) OR  (Sepal.Width <= 3.0)")
)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147488315883162/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147488315883162/fields...
#> took 0.222000000000008 to process the meta data.
#>   Sepal.Length Sepal.Width Petal.Length             _commit                  _id
#> 1          5.1         3.5          1.4 2060147502872920434 AYiQIeU-RgPV1mL7vTiW
#> 2          4.9         3.0          1.4 2060147502872920434 AYiQIeU-RgPV1mL7vTiX
#> 3          5.4         3.9          1.7 2060147502872920434 AYiQIeU-RgPV1mL7vTib
#> 4          4.4         2.9          1.4 2060147502872920434 AYiQIeU-RgPV1mL7vTie
#> 5          5.4         3.7          1.5 2060147502872920434 AYiQIeU-RgPV1mL7vTig
# Original data order: look the dataset up by its full path and query it
# without any ordering argument.
ds <- Dataset(full_path =  "quartzbio:Public:/TCGA/2.0.0-2018-mc3-v0.2.8/PatientsDiseases")
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
# Fetch up to 2000 rows.
# NOTE(review): the effect of meta = TRUE is not visible in this output —
# confirm against the Dataset_query documentation.
df <- Dataset_query(ds, limit=2000, meta = TRUE)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959470796697098463/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959470796697098463/fields...
#> took 0.161000000000001 to process the meta data.
head(df)
#>   PATIENT_BARCODE  SAMPLE_BARCODE DISEASE        SUBTYPE
#> 1    TCGA-OR-A5J1 TCGA-OR-A5J1-01     ACC Not_Applicable
#> 2    TCGA-OR-A5J2 TCGA-OR-A5J2-01     ACC Not_Applicable
#> 3    TCGA-OR-A5J3 TCGA-OR-A5J3-01     ACC Not_Applicable
#> 4    TCGA-OR-A5J5 TCGA-OR-A5J5-01     ACC Not_Applicable
#> 5    TCGA-OR-A5J6 TCGA-OR-A5J6-01     ACC Not_Applicable
#> 6    TCGA-OR-A5J7 TCGA-OR-A5J7-01     ACC Not_Applicable

# Sort by fields.
# NOTE(review): `fields` is not used by the query below; presumably it is
# retrieved to show which names `ordering` accepts — confirm.
fields <- DatasetFields(ds)
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959470796697098463/fields...
# Same query with an ordering: rows come back sorted by patient barcode, then
# sample barcode (compare head(dfo) with head(df)).
# NOTE(review): the ordering names are lowercase while the printed headers are
# uppercase — presumably field names vs. display titles; confirm.
dfo <- Dataset_query(ds, ordering = c('patient_barcode', 'sample_barcode'), limit=2000, meta = TRUE)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959470796697098463/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959470796697098463/fields...
#> took 0.164000000000016 to process the meta data.
head(dfo)
#>   PATIENT_BARCODE  SAMPLE_BARCODE DISEASE          SUBTYPE
#> 1    TCGA-02-0047 TCGA-02-0047-01     GBM            IDHwt
#> 2    TCGA-02-0055 TCGA-02-0055-01     GBM            IDHwt
#> 3    TCGA-02-2483 TCGA-02-2483-01     GBM IDHmut-non-codel
#> 4    TCGA-02-2485 TCGA-02-2485-01     GBM            IDHwt
#> 5    TCGA-02-2486 TCGA-02-2486-01     GBM            IDHwt
#> 6    TCGA-04-1348 TCGA-04-1348-01      OV   Not_Applicable


# Clean up: delete the vault `v` ("_iris_upload") used for the uploads above.
delete(v)
#> DELETE https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults/2669...
#> Vault "vsim-dev:_iris_upload" ("upload", general, rwa), @ "Karl Forner", updated at:2023-06-06T09:57:58.526Z