Download data from Cassava Base

Author

Delgado Luis Fernando

Data Download from Cassava Base using QBMS Package

This documentation outlines the process of downloading data from Cassava Base through the use of the QBMS (Query Breeding Management System) package. The QBMS package facilitates the interaction with breeding databases such as BMS (Breeding Management System), BreedBase, and GIGWA through standardized BrAPI calls, enabling the efficient retrieval of both phenotypic and genotypic data.

Getting Started:

To begin downloading data from Cassava Base, refer to the attached code snippet. This R package is designed to be user-friendly, catering to the specific needs of those involved in the genetic improvement and analysis of crops.

For more information on how to use the QBMS package, visit the GitHub repository: QBMS on GitHub.

Load packages

This is where we check to see if all the necessary packages are installed on your computer.

## Packages (install)
# Names of the packages already present in the local library.
pkgs <- rownames(installed.packages())

# Install whichever required packages are missing. data.table is included
# because the script calls data.table::rbindlist() when combining the data.
required_pkgs <- c("tidyverse", "QBMS", "here", "data.table")
missing_pkgs <- setdiff(required_pkgs, pkgs)
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

The code will detect if certain packages are missing from your package library, and if so, will install them automatically.

Now we need to load the packages:

library(tidyverse)
library(QBMS)
library(here)

Historical Data from Cassava Base

The Cassava program at CIAT has a rich history of conducting various trials. The code snippet provided below offers a straightforward method for downloading this historical data, facilitating easy access to valuable research information.

Connection with the Cassava BreedBase server

# Configure the QBMS connection to the Cassavabase BrAPI endpoint: BreedBase
# engine, no authentication, a time-out of 300 and a page size of 10000.
set_qbms_config(
  "https://cassavabase.org/brapi/v1/calls/",
  engine = "breedbase",
  no_auth = TRUE,
  path = "",
  page_size = 10000,
  time_out = 300
)

List the crops supported by the current server

# Print the crops available on the configured server.
list_crops()

[1] "Cassava"

Select a crop by name

# Select "Cassava" as the crop used by the calls that follow.
set_crop("Cassava")

List all breeding programs in the selected crop

# Print the breeding programs available for the selected crop.
list_programs()

       programName
1              5CP
2              BTI
3             CARI
4               CH
5             CIAT
6     CIP-genebank
7             CNRA
8          Cornell
9             CSIR
10         Embrapa
11           IDIAF
12            IITA
13  INERA_IITA_DRC
14           ISABU
15             ITC
16           KALRO
17              KU
18          NaCRRI
19           NRCRI
20          Rayong
21           SLARI
22            TARI
23             UAC
24              UH
25 UNILA-Indonesia
26            ZARI

Select the desired breeding program by name

# Select "CIAT" as the breeding program used by the calls that follow.
set_program("CIAT")

List all trials in the selected program

# Retrieve the trials of the selected program and extract them as a vector
# (`pull()` with no column argument takes the last column of the table).
trials <- pull(list_trials())
trials

 [1] "Malawi"       "Africa"       "Vietnam_2018" "Vietnam_2019" "Vietnam_2020"
 [6] "Vietnam_2021" "Vietnam_2022" "Vietnam_2023" "Vietnam_2024" "Asia"        
[11] "CIAT_1979"    "CIAT_1980"    "CIAT_1981"    "CIAT_1982"    "CIAT_1983"   
[16] "CIAT_1984"    "CIAT_1985"    "CIAT_1986"    "CIAT_1987"    "CIAT_1988"   
[21] "CIAT_1989"    "CIAT_1990"    "CIAT_1991"    "CIAT_1992"    "CIAT_1993"   
[26] "CIAT_1994"    "CIAT_1995"    "CIAT_1996"    "CIAT_1997"    "CIAT_1998"   
[31] "CIAT_1999"    "CIAT_2000"    "CIAT_2001"    "CIAT_2002"    "CIAT_2003"   
[36] "CIAT_2004"    "CIAT_2005"    "CIAT_2006"    "CIAT_2007"    "CIAT_2008"   
[41] "CIAT_2009"    "CIAT_2010"    "CIAT_2011"    "CIAT_2012"    "CIAT_2013"   
[46] "CIAT_2014"    "CIAT_2015"    "CIAT_2016"    "CIAT_2017"    "CIAT_2018"   
[51] "CIAT_2019"    "CIAT_2020"    "CIAT_2021"    "CIAT_2022"    "CIAT_2023"   
[56] "CIAT_2024"    "CIAT"

Above you can see all the trials, most of them grouped by year, that have been carried out by the Cassava program. For this example, we will use the CIAT trials from 1979 to 2022.

# Keep only the trials whose name begins with "CIAT".
trials <- trials[startsWith(trials, "CIAT")]
trials

 [1] "CIAT_1979" "CIAT_1980" "CIAT_1981" "CIAT_1982" "CIAT_1983" "CIAT_1984"
 [7] "CIAT_1985" "CIAT_1986" "CIAT_1987" "CIAT_1988" "CIAT_1989" "CIAT_1990"
[13] "CIAT_1991" "CIAT_1992" "CIAT_1993" "CIAT_1994" "CIAT_1995" "CIAT_1996"
[19] "CIAT_1997" "CIAT_1998" "CIAT_1999" "CIAT_2000" "CIAT_2001" "CIAT_2002"
[25] "CIAT_2003" "CIAT_2004" "CIAT_2005" "CIAT_2006" "CIAT_2007" "CIAT_2008"
[31] "CIAT_2009" "CIAT_2010" "CIAT_2011" "CIAT_2012" "CIAT_2013" "CIAT_2014"
[37] "CIAT_2015" "CIAT_2016" "CIAT_2017" "CIAT_2018" "CIAT_2019" "CIAT_2020"
[43] "CIAT_2021" "CIAT_2022" "CIAT_2023" "CIAT_2024" "CIAT"

# Keep only the yearly CIAT trials from 1979 to 2022. Selecting by the year
# encoded in the trial name (instead of by position in the vector) keeps the
# result correct if the server later adds, removes, or reorders trials.
first_year <- 1979L
last_year <- 2022L

# Names that do not look like "CIAT_<4-digit year>" yield NA and are dropped.
trial_year <- as.integer(str_match(trials, "^CIAT_([0-9]{4})$")[, 2])
trials <- trials[!is.na(trial_year) &
                   trial_year >= first_year &
                   trial_year <= last_year]
trials

 [1] "CIAT_1979" "CIAT_1980" "CIAT_1981" "CIAT_1982" "CIAT_1983" "CIAT_1984"
 [7] "CIAT_1985" "CIAT_1986" "CIAT_1987" "CIAT_1988" "CIAT_1989" "CIAT_1990"
[13] "CIAT_1991" "CIAT_1992" "CIAT_1993" "CIAT_1994" "CIAT_1995" "CIAT_1996"
[19] "CIAT_1997" "CIAT_1998" "CIAT_1999" "CIAT_2000" "CIAT_2001" "CIAT_2002"
[25] "CIAT_2003" "CIAT_2004" "CIAT_2005" "CIAT_2006" "CIAT_2007" "CIAT_2008"
[31] "CIAT_2009" "CIAT_2010" "CIAT_2011" "CIAT_2012" "CIAT_2013" "CIAT_2014"
[37] "CIAT_2015" "CIAT_2016" "CIAT_2017" "CIAT_2018" "CIAT_2019" "CIAT_2020"
[43] "CIAT_2021" "CIAT_2022"

Downloading process

The download process may take a short or long time depending on the number of trials to be downloaded.

#' Download the data of every study in one trial
#'
#' Selects `trial` with `set_trial()`, lists its studies, and then selects
#' each study in turn and downloads its data.
#'
#' @param trial Trial name, passed unchanged to `set_trial()`.
#' @return A list with one element per study, named by study name; each
#'   element is whatever `get_study_data()` returns for that study.
process_trial <- function(trial) {
  set_trial(trial)

  # The trial ontology was previously fetched here into a variable that was
  # never read; that request is omitted because nothing used its result.
  study_names <- list_studies() %>% pull(studyName)

  # Naming the input vector makes map() return a list named by study.
  map(set_names(study_names), function(study_name) {
    set_study(study_name)
    get_study_data()
  })
}

# Download every selected trial, then remove one level of nesting so that
# the per-trial lists of studies become a single list of studies.
raw_data <- trials %>%
  map(process_trial) %>%
  flatten()

Convert list into a data.frame

# Stack the per-study tables into one tibble (`fill = TRUE` allows studies
# with differing columns) and keep only the plot-level observations.
all_raw <- raw_data %>%
  data.table::rbindlist(fill = TRUE) %>%
  as_tibble() %>%
  filter(observationLevel == "plot")

# Dimensions of the entire data set
dim(all_raw)

Save the data as an .rds file

# Prefix identifying this data set in the output file name.
trial_interest <- "hist_raw_data_"

# File name of the form "01_hist_raw_data_<current date>.rds".
meta_file_name.r <- paste0("01_", trial_interest, Sys.Date(), ".rds")

# Make sure the output folder exists first: saveRDS() cannot write into a
# missing directory, and failing here would discard the downloaded data.
output_dir <- here::here("data")
dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)

saveRDS(all_raw, file = file.path(output_dir, meta_file_name.r))