Vaults, Files, Datasets and Objects
vaults_and_objects.Rmd
TL;DR
library(quartzbio.edp, quiet = TRUE)
Sys.getenv('EDP_PROFILE')
#> [1] ""
Sys.setenv(EDP_PROFILE = 'vsim-dev_rw')
Sys.getenv('EDP_PROFILE')
#> [1] "vsim-dev_rw"
vlts <- Vaults()
#> Loading required package: RcppSimdJson
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v1/user...
#> Connected to https://vsim-dev.api.edp.aws.quartz.bio with user "Karl Forner" using an API Token
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults...
u <- User()
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v1/user...
u$first_name
#> [1] "Karl"
vme <- Vault_create(paste0('vault_', u$first_name))
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults...
Vault(id = vme)
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults/2666...
#> Vault "vsim-dev:vault_Karl" ("", general, RWA), @ "Karl Forner", updated at:2023-06-06T09:54:59.653Z
Folder_create(vme, 'data/cyto')
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> folder "vsim-dev:vault_Karl:/data/cyto" (application/vnd.solvebio.folder) nb:0 user:Karl Forner accessed:2023-06-06T09:55:00.418Z
Folder_create(vme, 'data/flow')
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> folder "vsim-dev:vault_Karl:/data/flow" (application/vnd.solvebio.folder) nb:0 user:Karl Forner accessed:2023-06-06T09:55:00.751Z
Folder_create(vme, 'source/code')
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> folder "vsim-dev:vault_Karl:/source/code" (application/vnd.solvebio.folder) nb:0 user:Karl Forner accessed:2023-06-06T09:55:01.261Z
Folders(vault_id=vme)
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> EDP List of 5 Objects
#> path object_type vault_name user_name last_modified
#> 1 /data folder vault_Karl Karl Forner 2023-06-06T09:55:00.189Z
#> 2 /data/cyto folder vault_Karl Karl Forner 2023-06-06T09:55:00.418Z
#> 3 /data/flow folder vault_Karl Karl Forner 2023-06-06T09:55:00.751Z
#> 4 /source folder vault_Karl Karl Forner 2023-06-06T09:55:01.079Z
#> 5 /source/code folder vault_Karl Karl Forner 2023-06-06T09:55:01.261Z
delete(vme)
#> DELETE https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults/2666...
#> Vault "vsim-dev:vault_Karl" ("", general, rwa), @ "Karl Forner", updated at:2023-06-06T09:55:01.660Z
User
# select an EDP instance using a profile
u <- User()
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v1/user...
u$first_name
#> [1] "Karl"
u$last_name
#> [1] "Forner"
u$full_name
#> [1] "Karl Forner"
u$id
#> [1] 51
u$email
#> [1] "karl.forner@precisionformedicine.com"
u$account$name
#> [1] "vsim-dev"
u$url
#> [1] "https://vsim-dev.api.edp.aws.quartz.bio/v1/users/51"
Vaults
# select an EDP instance using a profile
Sys.setenv(EDP_PROFILE = 'vsim-dev_rw')
# fetch personal vault info
myV <- Vault()
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v1/user...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults...
# personal vault: user-id
myV$name
#> [1] "user-51"
myV$full_path
#> [1] "vsim-dev:user-51"
myV$has_children
#> [1] TRUE
myV$has_folder_children
#> [1] TRUE
myV$user$full_name
#> [1] "Karl Forner"
myV$permissions
#> $read
#> [1] TRUE
#>
#> $write
#> [1] TRUE
#>
#> $admin
#> [1] FALSE
Vault creation
# create a vault
v <- Vault_create('vault_test_1', description = 'test_1', tags = list('tag1', 'tag2'))
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults...
v$name
#> [1] "vault_test_1"
v$full_path
#> [1] "vsim-dev:vault_test_1"
# retrieve a vault by name
Vault(name = 'vault_test_1')
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults...
#> Vault "vsim-dev:vault_test_1" ("test_1", general, RWA), @ "Karl Forner", updated at:2023-06-06T09:55:02.501Z
# retrieve a vault by full_path
Vault(full_path = v$full_path)
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults...
#> Vault "vsim-dev:vault_test_1" ("test_1", general, RWA), @ "Karl Forner", updated at:2023-06-06T09:55:02.501Z
v$description
#> [1] "test_1"
v$metadata
#> NULL
v$tags
#> [[1]]
#> [1] "tag1"
#>
#> [[2]]
#> [1] "tag2"
# update metadata
new_name <- 'test_one'
v2 <- Vault_update(v,
name = new_name,
description = 'barabor',
metadata = list(meta_1 = 'foo'),
storage_class = 'Performance', tags = 'tag_A')
#> PUT https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults/2667...
v3 <- update(v2, storage_class = 'Temporary')
#> PUT https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults/2667...
Vaults listing, limit and paging
- Vaults are ordered per locale “LC_TIME”, “us”
- Intermediate pages can be sized using limit
- fetch_next and fetch_all use that limit
# get the first two ordered vaults
vs1 <- Vaults(limit = 2, page = 1)
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults...
# get the third and fourth vaults
Vaults(limit = 2, page = 2)
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults...
#> List of 2 Vaults
#> id name full_path user_name vault_type created_at
#> 1 19 Public quartzbio:Public Admin Dev user general 2022-11-02T23:38:43.735Z
#> 2 1300 Public Data Dev vsim-dev:Public Data Dev David Caplan general 2023-02-15T20:07:31.859Z
# same as above
vs2 <- fetch_next(vs1)
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults...
vs2
#> List of 2 Vaults
#> id name full_path user_name vault_type created_at
#> 1 19 Public quartzbio:Public Admin Dev user general 2022-11-02T23:38:43.735Z
#> 2 1300 Public Data Dev vsim-dev:Public Data Dev David Caplan general 2023-02-15T20:07:31.859Z
# fetch all remaining vaults by pages of size 2
all_vlts <- fetch_all(vs1)
all_vlts_df <- as.data.frame(all_vlts)
# delete vault
delete(v)
#> DELETE https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults/2667...
#> Vault "vsim-dev:test_one" ("barabor", general, rwa), @ "Karl Forner", updated at:2023-06-06T09:55:04.616Z
Folders
# list all folders in an account, recursive by default
all_folders <- Folders()
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
# control depth
suppressMessages(Folders(depth=0))
#> EDP List of 18 Objects
#> path object_type vault_name user_name last_modified
#> 1 /ClinGen folder Public Data Dev David Caplan 2023-02-15T20:07:39.539Z
#> 2 /ClinVar folder Public David Caplan 2022-11-10T19:46:56.464Z
#> 3 /Dandan Testing CTMS folder ICF Dandan Xu 2023-02-03T01:11:53.077Z
#> 4 /dir1 folder user-51 Karl Forner 2023-03-02T14:02:17.545Z
#> 5 /GENCODE folder Public David Caplan 2022-11-13T01:24:41.193Z
#> 6 /HGNC folder Public David Caplan 2022-11-02T23:42:11.247Z
#> 7 /MEDLINE folder Public David Caplan 2022-11-03T00:22:13.193Z
#> 8 /recursive folder user-51 Karl Forner 2023-03-02T14:56:40.353Z
#> 9 /r_examples folder user-51 Karl Forner 2023-03-17T10:31:37.489Z
#> 10 /Study ICF Templates folder ICF Dandan Xu 2023-02-03T00:52:20.747Z
#> 11 /TCGA folder Public Tatjana Damnjanovic 2023-01-18T12:07:10.392Z
#> 12 /v1 folder user-51 Karl Forner 2023-05-11T08:44:15.267Z
suppressMessages(Folders(depth=1))
#> EDP List of 11 Objects
#> path object_type vault_name user_name last_modified
#> 1 /ClinVar/5.2.0-20210110 folder Public David Caplan 2022-11-10T19:47:11.142Z
#> 2 /ClinVar/5.2.0-20221105 folder Public David Caplan 2022-11-10T20:32:55.867Z
#> 3 /dir1/dir2 folder user-51 Karl Forner 2023-03-02T14:02:17.736Z
#> 4 /GENCODE/3.0.0-42 folder Public David Caplan 2022-11-13T01:24:41.387Z
#> 5 /HGNC/3.3.1-2021-08-25 folder Public David Caplan 2022-11-02T23:45:22.368Z
#> 6 /MEDLINE/2.3.4-2018 folder Public David Caplan 2022-11-03T00:22:26.392Z
#> 7 /recursive/folder folder user-51 Karl Forner 2023-03-02T14:56:40.536Z
#> 8 /TCGA/2.0.0-2018-mc3-v0.2.8 folder Public Tatjana Damnjanovic 2023-01-18T14:41:02.586Z
# get first four
Folders(limit = 4)
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> EDP List of 4 Objects
#> path object_type vault_name user_name last_modified
#> 1 /ClinGen folder Public Data Dev David Caplan 2023-02-15T20:07:39.539Z
#> 2 /ClinVar folder Public David Caplan 2022-11-10T19:46:56.464Z
#> 3 /ClinVar/5.2.0-20210110 folder Public David Caplan 2022-11-10T19:47:11.142Z
#> 4 /ClinVar/5.2.0-20221105 folder Public David Caplan 2022-11-10T20:32:55.867Z
# create folder with description and tags
v1 <- Vault_create('_an_upload', description = 'upload', tags = list('fake', 'can_be_removed'))
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults...
fdata <- Folder_create(v1, 'data')
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
fdata
#> folder "vsim-dev:_an_upload:/data" (application/vnd.solvebio.folder) nb:0 user:Karl Forner accessed:2023-06-06T09:55:06.147Z
# CAUTION: no overwriting by default, folders are renamed incrementally
# a new folder data-1 is created
Folder_create(v1, 'data')
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> folder "vsim-dev:_an_upload:/data-1" (application/vnd.solvebio.folder) nb:0 user:Karl Forner accessed:2023-06-06T09:55:06.339Z
# create a hierarchy
Folder_create(v1, 'source/code')
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> folder "vsim-dev:_an_upload:/source/code" (application/vnd.solvebio.folder) nb:0 user:Karl Forner accessed:2023-06-06T09:55:06.838Z
# List folders of a given vault - recursive
Folders(vault_id = v1)
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> EDP List of 4 Objects
#> path object_type vault_name user_name last_modified
#> 1 /data folder _an_upload Karl Forner 2023-06-06T09:55:06.147Z
#> 2 /data-1 folder _an_upload Karl Forner 2023-06-06T09:55:06.339Z
#> 3 /source folder _an_upload Karl Forner 2023-06-06T09:55:06.656Z
#> 4 /source/code folder _an_upload Karl Forner 2023-06-06T09:55:06.838Z
# List folders using regex on paths - recursive
Folders(regex = '^/data')
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> EDP List of 2 Objects
#> path object_type vault_name user_name last_modified
#> 1 /data folder _an_upload Karl Forner 2023-06-06T09:55:06.147Z
#> 2 /data-1 folder _an_upload Karl Forner 2023-06-06T09:55:06.339Z
Folders(regex = 'code$')
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> EDP List of 2 Objects
#> path object_type vault_name user_name last_modified
#> 1 /GENCODE folder Public David Caplan 2022-11-13T01:24:41.193Z
#> 2 /source/code folder _an_upload Karl Forner 2023-06-06T09:55:06.838Z
# List folders matching paths - recursive
Folders(query = 'code')
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> EDP List of 1 Objects
#> path object_type vault_name user_name last_modified
#> 1 /source/code folder _an_upload Karl Forner 2023-06-06T09:55:06.838Z
# fetch a folder from a given vault
Folder(vault_id = v1, path = 'data')
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> folder "vsim-dev:_an_upload:/data" (application/vnd.solvebio.folder) nb:0 user:Karl Forner accessed:2023-06-06T09:55:06.147Z
# fetch a folder with its full.path
Folder(full_path = fdata$full_path)
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> folder "vsim-dev:_an_upload:/data" (application/vnd.solvebio.folder) nb:0 user:Karl Forner accessed:2023-06-06T09:55:06.147Z
delete(v1)
#> DELETE https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults/2668...
#> Vault "vsim-dev:_an_upload" ("upload", general, rwa), @ "Karl Forner", updated at:2023-06-06T09:55:08.315Z
Files
Upload and Download
upload CSV
irisp <- file.path(tempdir(), 'iris.csv')
write.csv(iris[1:10,], irisp)
v <- Vault_create('_iris_upload', description = 'upload',
tags = list('iris', 'can_be_removed'))
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults...
vpath <- 'data/iris/v1/iris_10.csv'
# File upload: folder hierarchy is created on the fly
firis <- File_upload(v, irisp, vpath )
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> uploading file /tmp/RtmpeUkAj5/iris.csv...
firis$full_path
#> [1] "vsim-dev:_iris_upload:/data/iris/v1/iris_10.csv"
firis$path
#> [1] "/data/iris/v1/iris_10.csv"
firis$md5
#> [1] "14f700477f0105443bd294683f270a58"
# File download
res_down <- File_download(firis, file.path(dirname(irisp), 'iris_10_download.csv'))
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects/2060147415064509835/download...
dir(dirname(irisp), 'iris')
#> [1] "iris.csv" "iris_10_download.csv"
# fetch a File object
File( full_path = firis$full_path)
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> file "vsim-dev:_iris_upload:/data/iris/v1/iris_10.csv" (text/csv) nb:0 user:Karl Forner accessed:2023-06-06T09:55:11.032Z
firis
#> file "vsim-dev:_iris_upload:/data/iris/v1/iris_10.csv" (text/csv) nb:0 user:Karl Forner accessed:2023-06-06T09:55:09.713Z
upload JSONL
JSON Lines (also called JSONL, newline-delimited JSON or ndjson) is a text format that is convenient for storing structured data that may be processed one record at a time.
# upload jsonl file
irisj <- file.path(tempdir(), "iris.json")
# the stream_out function writes the JSONL format to a connection
jsonlite:::stream_out(iris[1:15, ], file(irisj))
#> opening file output connection.
#> Complete! Processed total of 15 rows.
#> closing file output connection.
vpath <- "v1/iris_10.json"
fi2 <- File_upload(v, irisj, vpath)
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> uploading file /tmp/RtmpeUkAj5/iris.json...
fi2$path
#> [1] "/v1/iris_10.json"
fi2$mimetype
#> [1] "application/json"
File_query(fi2)
#> fetching the number of rows of the file...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects/2060147433357145394...
#> found lines
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects/2060147433357145394/data...
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 5.1 3.5 1.4 0.2 setosa
#> 2 4.9 3.0 1.4 0.2 setosa
#> 3 4.7 3.2 1.3 0.2 setosa
#> 4 4.6 3.1 1.5 0.2 setosa
#> 5 5.0 3.6 1.4 0.2 setosa
#> 6 5.4 3.9 1.7 0.4 setosa
#> 7 4.6 3.4 1.4 0.3 setosa
#> 8 5.0 3.4 1.5 0.2 setosa
#> 9 4.4 2.9 1.4 0.2 setosa
#> 10 4.9 3.1 1.5 0.1 setosa
#> 11 5.4 3.7 1.5 0.2 setosa
#> 12 4.8 3.4 1.6 0.2 setosa
#> 13 4.8 3.0 1.4 0.1 setosa
#> 14 4.3 3.0 1.1 0.1 setosa
#> 15 5.8 4.0 1.2 0.2 setosa
File_query(fi2, filters = filters("Sepal.Length >= 5"))
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects/2060147433357145394/data...
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 5.1 3.5 1.4 0.2 setosa
#> 2 5.0 3.6 1.4 0.2 setosa
#> 3 5.4 3.9 1.7 0.4 setosa
#> 4 5.0 3.4 1.5 0.2 setosa
#> 5 5.4 3.7 1.5 0.2 setosa
#> 6 5.8 4.0 1.2 0.2 setosa
Files Listing
File listing works like vault and folder listing: it uses the same filtering criteria and the same fetch_next and fetch_all functions.
all_files <- as.data.frame(Files())
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
Files(limit = 3)
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> EDP List of 3 Objects
#> path object_type vault_name user_name last_modified
#> 1 /ClinGen/ClinGen-1-0-0-2015-06-02-Genes-1098056416333778650-20230215200640.json.gz file Public Data Dev David Caplan 2023-02-15T20:07:46.945Z
#> 2 /ClinVar/5.2.0-20210110/ClinVar-5-2-0-20210110-Variants-GRCH37-1425664822266145048-20221110194518.json.gz file Public David Caplan 2022-11-10T19:47:23.197Z
#> 3 /ClinVar/5.2.0-20210110/ClinVar-5-2-0-20210110-Variants-GRCH38-1425664716292714398-20221110194523.json.gz file Public David Caplan 2022-11-10T19:47:23.188Z
Files(regex = 'TCG*')
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> EDP List of 2 Objects
#> path object_type vault_name user_name last_modified
#> 1 /TCGA/2.0.0-2018-mc3-v0.2.8/TCGA-mc3.v0.2.8.PUBLIC.LAML_PATCH.maf.oncokb.txt.gz file Public David Caplan 2023-03-13T16:22:09.865Z
#> 2 /TCGA/2.0.0-2018-mc3-v0.2.8/TCGA_Patients_Diseases.txt file Public Tatjana Damnjanovic 2023-01-18T12:07:22.391Z
Files(regex = 'TCG*')
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> EDP List of 2 Objects
#> path object_type vault_name user_name last_modified
#> 1 /TCGA/2.0.0-2018-mc3-v0.2.8/TCGA-mc3.v0.2.8.PUBLIC.LAML_PATCH.maf.oncokb.txt.gz file Public David Caplan 2023-03-13T16:22:09.865Z
#> 2 /TCGA/2.0.0-2018-mc3-v0.2.8/TCGA_Patients_Diseases.txt file Public Tatjana Damnjanovic 2023-01-18T12:07:22.391Z
# files from a given vault
Files(vault_id = v)
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> EDP List of 2 Objects
#> path object_type vault_name user_name last_modified
#> 1 /data/iris/v1/iris_10.csv file _iris_upload Karl Forner 2023-06-06T09:55:09.713Z
#> 2 /v1/iris_10.json file _iris_upload Karl Forner 2023-06-06T09:55:11.894Z
# files from a given folder
dirs_df <- suppressMessages(as.data.frame(Folders()))
dplyr::filter(dirs_df, path == '/GENCODE/3.0.0-42')
#> account_id ancestor_object_ids availability class_name created_at dataset_description dataset_documents_count documents_count dataset_full_name storage_class dataset_id depth
#> 1 2 1911311459582295135 NULL Object 2022-11-13T01:24:41.387Z NULL NULL NULL NULL NULL NULL 1
#> description filename full_path has_children has_folder_children id is_deleted is_transformable last_accessed last_modified md5
#> 1 NULL 3.0.0-42 quartzbio:Public:/GENCODE/3.0.0-42 TRUE FALSE 1911311461138976190 FALSE FALSE 2022-11-13T01:24:41.387Z 2022-11-13T01:24:41.387Z NULL
#> metadata mimetype num_children num_descendants object_type parent_object_id path size tags updated_at upload_url url
#> 1 NULL application/vnd.solvebio.folder 4 4 folder 1911311459582295135 /GENCODE/3.0.0-42 NULL NULL 2023-05-23T12:41:09.998Z NULL NULL
#> user user_id vault_id vault_name version_count global_beacon
#> 1 User, david.caplan@precisionformedicine.com, David, David Caplan, 2, TRUE, TRUE, primary_owner, 2 19 Public 0 NULL
fo <- Folder(full_path = 'quartzbio:Public:/GENCODE/3.0.0-42')
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
fo
#> folder "quartzbio:Public:/GENCODE/3.0.0-42" (application/vnd.solvebio.folder) nb:0 user:David Caplan accessed:2022-11-13T01:24:41.387Z
Files(ancestor_id = fo$id)
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> EDP List of 2 Objects
#> path object_type vault_name user_name last_modified
#> 1 /GENCODE/3.0.0-42/gencode.v42.annotation.gtf.gz file Public David Caplan 2022-11-13T02:21:57.723Z
#> 2 /GENCODE/3.0.0-42/gencode.v42lift37.annotation.gtf.gz file Public David Caplan 2022-11-13T02:23:12.714Z
Files Move
# move file from v1/ to v2/
f1 <- Folder_create(vault_id=v, '/data/iris/v2')
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
firis_v2 <- Object_update(firis, parent_object_id = f1$id)
#> PUT https://vsim-dev.api.edp.aws.quartz.bio/v2/objects/2060147415064509835...
firis_v2
#> file "vsim-dev:_iris_upload:/data/iris/v2/iris_10.csv" (text/csv) nb:10 user:Karl Forner accessed:2023-06-06T09:55:11.032Z
Files Querying
The columns and rows of tabular objects can be queried.
# fetch the two first rows
f2r <- File_query(firis_v2, limit=2)
#> fetching the number of rows of the file...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects/2060147415064509835...
#> found 10 lines
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects/2060147415064509835/data...
# fetch the two next ones
fetch_next(f2r)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects/2060147415064509835/data...
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 3 4.7 3.2 1.3 0.2 setosa
#> 2 4 4.6 3.1 1.5 0.2 setosa
# fetch row 8
File_query(firis_v2, limit=1, offset=7)
#> fetching the number of rows of the file...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects/2060147415064509835...
#> found 10 lines
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects/2060147415064509835/data...
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 8 5 3.4 1.5 0.2 setosa
iris[8,]
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 8 5 3.4 1.5 0.2 setosa
# fetch all Setosa
File_query(firis_v2, filters= filters('Species contains "setosa"'))
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects/2060147415064509835/data...
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 1 5.1 3.5 1.4 0.2 setosa
#> 2 2 4.9 3 1.4 0.2 setosa
#> 3 3 4.7 3.2 1.3 0.2 setosa
#> 4 4 4.6 3.1 1.5 0.2 setosa
#> 5 5 5 3.6 1.4 0.2 setosa
#> 6 6 5.4 3.9 1.7 0.4 setosa
#> 7 7 4.6 3.4 1.4 0.3 setosa
#> 8 8 5 3.4 1.5 0.2 setosa
#> 9 9 4.4 2.9 1.4 0.2 setosa
#> 10 10 4.9 3.1 1.5 0.1 setosa
File_query(firis_v2, filters= filters('Sepal.Length = 4.7'))
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects/2060147415064509835/data...
#> data frame with 0 columns and 0 rows
Datasets
Create and Import from a data.frame
iris_2 <- Dataset_create(v, 'iris_2.ds')
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
iris_2$full_path
#> [1] "vsim-dev:_iris_upload:/iris_2.ds"
iris_2$object_type
#> [1] "dataset"
import_res <- Dataset_import(iris_2, df = iris[1:12, ], sync = TRUE)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/dataset_imports...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060147492615808236...
#> waiting for task 2060147492615808236 ("Dataset Import" for "vsim-dev:_iris_upload:/iris_2.ds"), 29 retries left
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060147492615808236...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060147504343542628...
#> waiting for task 2060147504343542628 ("Dataset Commit" for "vsim-dev:_iris_upload:/iris_2.ds"), 28 retries left
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060147504343542628...
#> waiting for task 2060147504343542628 ("Dataset Commit" for "vsim-dev:_iris_upload:/iris_2.ds"), 27 retries left
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060147504343542628...
#> waiting for task 2060147504343542628 ("Dataset Commit" for "vsim-dev:_iris_upload:/iris_2.ds"), 26 retries left
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060147504343542628...
#> waiting for task 2060147504343542628 ("Dataset Commit" for "vsim-dev:_iris_upload:/iris_2.ds"), 25 retries left
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060147504343542628...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks...
iris_2 <- Dataset(vault_id = v, path = iris_2$path)
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
i2 <- Dataset_query(iris_2, limit = 1)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147488315883162/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147488315883162/fields...
#> took 0.161000000000001 to process the meta data.
fetch_next(i2)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147488315883162/data...
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 4.9 3 1.4 0.2 setosa
Create and Import from an EDP File object
# prepare the file to upload
MTCARS <- mtcars[1:4, 1:5 ]
# write it in a local file
local_path <- file.path(tempdir(), "mtcars.csv")
write.csv(MTCARS, local_path, row.names = FALSE)
# upload it
fi <- suppressMessages(File_upload(v, local_path, "/a/b/c/mtcars.csv"))
# create the dataset (empty)
ds <- Dataset_create(v, "mtcars.ds")
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
# import the EDP File to the Dataset
suppressMessages(imp <- Dataset_import(ds, file_id = fi, sync =TRUE))
#> Warning in Dataset_import(ds, file_id = fi, sync = TRUE): got timeout while waiting for dataset import task completion: 2060147658965121373
# with meta set to FALSE some reordering may appear in columns
df <- Dataset_query(ds, meta = FALSE, limit = 5000)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147655134713188/data...
df[1:3, ]
#> disp mpg hp drat cyl
#> 1 160 21.0 110 3.90 6
#> 2 160 21.0 110 3.90 6
#> 3 108 22.8 93 3.85 4
df_with_meta <- Dataset_query(ds, meta = TRUE)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147655134713188/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147655134713188/fields...
#> took 0.167000000000002 to process the meta data.
df_with_meta[1:3, ]
#> mpg cyl disp hp drat
#> 1 21.0 6 160 110 3.90
#> 2 21.0 6 160 110 3.90
#> 3 22.8 4 108 93 3.85
Create and Import from a list of records
genes_1 <- Dataset_create(v, 'genes.ds',
description = "genes hits",
metadata = list(DEV = TRUE),
tags = list("QBP", "EDP"),
storage_class = "Temporary", capacity = "small")
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
records <- list(
list(gene = "CFTR", importance = 1, sample_count = 2104),
list(gene = "BRCA2", importance = 1, sample_count = 1391),
list(gene = "CLIC2", importance = 5, sample_count = 14)
)
import_res <- Dataset_import(genes_1, records = records, sync = TRUE)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/dataset_imports...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060148482955668768...
#> waiting for task 2060148482955668768 ("Dataset Import" for "vsim-dev:_iris_upload:/genes.ds"), 29 retries left
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060148482955668768...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060148492685464999...
#> waiting for task 2060148492685464999 ("Dataset Commit" for "vsim-dev:_iris_upload:/genes.ds"), 28 retries left
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060148492685464999...
#> waiting for task 2060148492685464999 ("Dataset Commit" for "vsim-dev:_iris_upload:/genes.ds"), 27 retries left
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060148492685464999...
#> waiting for task 2060148492685464999 ("Dataset Commit" for "vsim-dev:_iris_upload:/genes.ds"), 26 retries left
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060148492685464999...
#> waiting for task 2060148492685464999 ("Dataset Commit" for "vsim-dev:_iris_upload:/genes.ds"), 25 retries left
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060148492685464999...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks...
g1 <- Dataset_query(genes_1, limit = 1)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060148478010548922/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060148478010548922/fields...
#> took 0.162000000000006 to process the meta data.
g1
#> importance gene sample_count
#> 1 1 CFTR 2104
fetch_all(g1)
#> importance gene sample_count
#> 1 1 CFTR 2104
#> 2 1 BRCA2 1391
#> 3 5 CLIC2 14
Dataset_query(genes_1, filters = filters('sample_count <= 14'))
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060148478010548922/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060148478010548922/fields...
#> took 0.157999999999987 to process the meta data.
#> importance gene sample_count
#> 1 5 CLIC2 14
Modify records with python expression
nobs <- Dataset_create(v, 'dna_gurus.ds')
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
authors <- list(
list(name='Francis Crick'),
list(name='James Watson'),
list(name='Rosalind Franklin')
)
# additional first and last name fields to be created
target_fields <- list(
list(
name="first_name",
description="Adds a first name column based on name column",
data_type="string",
expression="record.name.split(' ')[0]"
),
list(
name="last_name",
description="Adds a last name column based on name column",
data_type="string",
expression="record.name.split(' ')[-1]"
)
)
res <- Dataset_import(nobs, records = authors,
target_fields = target_fields,
sync = TRUE)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/dataset_imports...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060148639020092336...
#> waiting for task 2060148639020092336 ("Dataset Import" for "vsim-dev:_iris_upload:/dna_gurus.ds"), 29 retries left
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060148639020092336...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060148649900332273...
#> waiting for task 2060148649900332273 ("Dataset Commit" for "vsim-dev:_iris_upload:/dna_gurus.ds"), 28 retries left
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060148649900332273...
#> waiting for task 2060148649900332273 ("Dataset Commit" for "vsim-dev:_iris_upload:/dna_gurus.ds"), 27 retries left
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060148649900332273...
#> waiting for task 2060148649900332273 ("Dataset Commit" for "vsim-dev:_iris_upload:/dna_gurus.ds"), 26 retries left
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060148649900332273...
#> waiting for task 2060148649900332273 ("Dataset Commit" for "vsim-dev:_iris_upload:/dna_gurus.ds"), 25 retries left
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks/2060148649900332273...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/tasks...
Dataset_query(nobs)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060148634457461929/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060148634457461929/fields...
#> took 0.248999999999995 to process the meta data.
#> first_name last_name name
#> 1 Francis Crick Francis Crick
#> 2 James Watson James Watson
#> 3 Rosalind Franklin Rosalind Franklin
Dataset Query
# fetch the first row
f1r_ds <- Dataset_query(iris_2, limit=1)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147488315883162/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147488315883162/fields...
#> took 0.183999999999997 to process the meta data.
fetch_next(f1r_ds)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147488315883162/data...
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 4.9 3 1.4 0.2 setosa
fetch_all(f1r_ds)
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 5.1 3.5 1.4 0.2 setosa
#> 2 4.9 3.0 1.4 0.2 setosa
#> 3 4.7 3.2 1.3 0.2 setosa
#> 4 4.6 3.1 1.5 0.2 setosa
#> 5 5.0 3.6 1.4 0.2 setosa
#> 6 5.4 3.9 1.7 0.4 setosa
#> 7 4.6 3.4 1.4 0.3 setosa
#> 8 5.0 3.4 1.5 0.2 setosa
#> 9 4.4 2.9 1.4 0.2 setosa
#> 10 4.9 3.1 1.5 0.1 setosa
#> 11 5.4 3.7 1.5 0.2 setosa
#> 12 4.8 3.4 1.6 0.2 setosa
# Filters act on column fields that match the imported data.frame.
Dataset_query(iris_2, filters= filters('Species contains "setosa"'))
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147488315883162/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147488315883162/fields...
#> took 0.169000000000011 to process the meta data.
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 5.1 3.5 1.4 0.2 setosa
#> 2 4.9 3.0 1.4 0.2 setosa
#> 3 4.7 3.2 1.3 0.2 setosa
#> 4 4.6 3.1 1.5 0.2 setosa
#> 5 5.0 3.6 1.4 0.2 setosa
#> 6 5.4 3.9 1.7 0.4 setosa
#> 7 4.6 3.4 1.4 0.3 setosa
#> 8 5.0 3.4 1.5 0.2 setosa
#> 9 4.4 2.9 1.4 0.2 setosa
#> 10 4.9 3.1 1.5 0.1 setosa
#> 11 5.4 3.7 1.5 0.2 setosa
#> 12 4.8 3.4 1.6 0.2 setosa
Dataset_query(iris_2, filters= filters('Sepal.Length >= 5.1'))
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147488315883162/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147488315883162/fields...
#> took 0.171000000000021 to process the meta data.
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 5.1 3.5 1.4 0.2 setosa
#> 2 5.4 3.9 1.7 0.4 setosa
#> 3 5.4 3.7 1.5 0.2 setosa
Dataset_query(iris_2, filters= filters('(Sepal.Length >= 5.1) OR (Sepal.Width <= 3.0)'))
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147488315883162/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147488315883162/fields...
#> took 0.163000000000011 to process the meta data.
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 5.1 3.5 1.4 0.2 setosa
#> 2 4.9 3.0 1.4 0.2 setosa
#> 3 5.4 3.9 1.7 0.4 setosa
#> 4 4.4 2.9 1.4 0.2 setosa
#> 5 5.4 3.7 1.5 0.2 setosa
# Keep fields
Dataset_query(iris_2,
fields = c('Petal.Width', 'Species'),
filters= filters('(Sepal.Length >= 5.1) OR (Sepal.Width <= 3.0)'))
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147488315883162/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147488315883162/fields...
#> took 0.168999999999983 to process the meta data.
#> Petal.Width Species
#> 1 0.2 setosa
#> 2 0.2 setosa
#> 3 0.4 setosa
#> 4 0.2 setosa
#> 5 0.2 setosa
# Exclude fields
Dataset_query(iris_2,
exclude_fields = c('Petal.Width', 'Species'),
filters= filters('(Sepal.Length >= 5.1) OR (Sepal.Width <= 3.0)'))
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147488315883162/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/2060147488315883162/fields...
#> took 0.222000000000008 to process the meta data.
#> Sepal.Length Sepal.Width Petal.Length _commit _id
#> 1 5.1 3.5 1.4 2060147502872920434 AYiQIeU-RgPV1mL7vTiW
#> 2 4.9 3.0 1.4 2060147502872920434 AYiQIeU-RgPV1mL7vTiX
#> 3 5.4 3.9 1.7 2060147502872920434 AYiQIeU-RgPV1mL7vTib
#> 4 4.4 2.9 1.4 2060147502872920434 AYiQIeU-RgPV1mL7vTie
#> 5 5.4 3.7 1.5 2060147502872920434 AYiQIeU-RgPV1mL7vTig
# original data order
ds <- Dataset(full_path = "quartzbio:Public:/TCGA/2.0.0-2018-mc3-v0.2.8/PatientsDiseases")
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/objects...
df <- Dataset_query(ds, limit=2000, meta = TRUE)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959470796697098463/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959470796697098463/fields...
#> took 0.161000000000001 to process the meta data.
head(df)
#> PATIENT_BARCODE SAMPLE_BARCODE DISEASE SUBTYPE
#> 1 TCGA-OR-A5J1 TCGA-OR-A5J1-01 ACC Not_Applicable
#> 2 TCGA-OR-A5J2 TCGA-OR-A5J2-01 ACC Not_Applicable
#> 3 TCGA-OR-A5J3 TCGA-OR-A5J3-01 ACC Not_Applicable
#> 4 TCGA-OR-A5J5 TCGA-OR-A5J5-01 ACC Not_Applicable
#> 5 TCGA-OR-A5J6 TCGA-OR-A5J6-01 ACC Not_Applicable
#> 6 TCGA-OR-A5J7 TCGA-OR-A5J7-01 ACC Not_Applicable
# sort by Fields
fields <- DatasetFields(ds)
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959470796697098463/fields...
dfo <- Dataset_query(ds, ordering = c('patient_barcode', 'sample_barcode'), limit=2000, meta = TRUE)
#> POST https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959470796697098463/data...
#> GET https://vsim-dev.api.edp.aws.quartz.bio/v2/datasets/1959470796697098463/fields...
#> took 0.164000000000016 to process the meta data.
head(dfo)
#> PATIENT_BARCODE SAMPLE_BARCODE DISEASE SUBTYPE
#> 1 TCGA-02-0047 TCGA-02-0047-01 GBM IDHwt
#> 2 TCGA-02-0055 TCGA-02-0055-01 GBM IDHwt
#> 3 TCGA-02-2483 TCGA-02-2483-01 GBM IDHmut-non-codel
#> 4 TCGA-02-2485 TCGA-02-2485-01 GBM IDHwt
#> 5 TCGA-02-2486 TCGA-02-2486-01 GBM IDHwt
#> 6 TCGA-04-1348 TCGA-04-1348-01 OV Not_Applicable
delete(v)
#> DELETE https://vsim-dev.api.edp.aws.quartz.bio/v2/vaults/2669...
#> Vault "vsim-dev:_iris_upload" ("upload", general, rwa), @ "Karl Forner", updated at:2023-06-06T09:57:58.526Z