source("src/itgr_measurements.R")
source("src/itgr_samples_info.R")
Injections des données
Chargements des scripts
Création de la base de données
<- DBI::dbConnect(RSQLite::SQLite(), "./contaminants-rlavoie-eccc.sqlite")
con <- strsplit(paste(readLines("sql/db_create_ddl.sql"), collapse = "\n"), ";\n")[[1]]
db_ddl_sql ::walk(db_ddl_sql, \(x) DBI::dbExecute(con, x)) purrr
Étape d’injections des données
Afin de respecter l’intégrité référentielle des données, l’injection des données doit se faire dans un certain ordre. Il faut par exemple renseigner en premier l’ensemble des sites avant de pouvoir déclarer les échantillons associés.
Voici l’ordre d’injection attendu au regard de la structure de la base de données.
- Sites
- Species
- Projects
- Field samples
- Lab samples
- Analyte
- Lab measurement
L’ensemble des données destinés à populer les tables a été consolider lors des étapes précédentes. Nous allons maintenant procéder à leur importation dans la base de données.
Importation des données
Table sites
<- readRDS("data/tbl_sites.rds") sites
On tranforme les intitulées de colonnes du data.frame pour qu’ils correspondent à ceux de la table de données SQL.
<- sites |>
data_sites ::select(
dplyrid_site = final_id,
province,
lat,
lon,
srid
)
::dbWriteTable(con, "sites", data_sites, append = TRUE) DBI
Table species
<- readRDS("data/tbl_species.rds")
data_species ::dbWriteTable(con, "species", data_species, append = TRUE) DBI
Table field_sample
<- readRDS("data/tbl_sites.rds") |>
map_sites ::separate_rows(original_ids, sep = ";") |>
tidyr::select(original_ids, final_id)
dplyr
options(dplyr.summarise.inform = FALSE)
<- itgr_samples_info() |>
samples ::select(-received_date, -id_project, -source) |>
dplyr::mutate(collection_date = as.character(collection_date)) |>
dplyr# Ajout des identifiants de site consolidé
::left_join(map_sites, by = c("id_site" = "original_ids")) |>
dplyr::select(-id_site) |>
dplyr::rename(id_site = final_id) |>
dplyr::mutate_if(is.character, stringr::str_trim) |>
dplyr::mutate_if(is.character, ~ifelse(.x == "N/A", NA, .x)) |>
dplyr::mutate_if(is.character, ~ifelse(.x == "NA", NA, .x)) |>
dplyr::distinct() |>
dplyr# Nettoyage des duplicates pour les champs age et tissus
::group_by(
dplyr::across(c(-age, -tissue))
dplyr|> dplyr::summarise(
) age = paste0(na.omit(age), collapse = ";"),
tissue = paste0(na.omit(tissue), collapse = ";")
|> dplyr::ungroup() |>
) ::mutate(
dplyrage = dplyr::na_if(age, ""),
tissue = dplyr::na_if(tissue, "")
|>
) # Nettoyage des sample IDs avec une dernière lettre
::mutate(id_lab_sample = stringr::str_replace(id_lab_sample, "\\s\\w+$", "")) |>
dplyr::group_by(
dplyr::across(c(-id_lab_sample, -id_source_report))
dplyr|> dplyr::summarise(
) id_lab_sample = paste0(na.omit(id_lab_sample), collapse = ";"),
id_source_report = paste0(na.omit(id_source_report), collapse = ";")
|> dplyr::ungroup() |>
) ::distinct() |>
dplyr::group_split(id_field_sample) |>
dplyr# Création d'un identifiant unique d'échantillonnage en se basant sur la date de collection
# L'identifiant de certains échantillons n'est pas unique
::map(\(df){
purrrif(nrow(df) > 1){
<- df |>
df ::mutate(id_field_sample = paste0(
dplyr"-",
id_field_sample, ::str_replace_all(collection_date, "[-]", "")
stringr
))
}return(df)
|> dplyr::bind_rows() |>
}) ::group_by(id_field_sample) |>
dplyr::mutate(id_field_sample = paste0(id_field_sample, "-", dplyr::row_number())) |>
dplyr::ungroup()
dplyr
# Inject field samples
::dbWriteTable(con, "field_sample", dplyr::select(samples, -id_lab_sample, -id_source_report) |>
DBI::distinct(), append = TRUE) dplyr
Table project
<- itgr_samples_info()[,c("id_project", "id_source_report")] |> dplyr::distinct()
project_samples <- readRDS("data/tbl_projects_reports.rds")
projects_reports
<- dplyr::select(projects_reports, id_project = project_id,
projects title = project_title, investigator = project_leader) |>
::mutate(organization = "ECCC", data_manager = NA, email_investigator = NA, email_data_manager = NA, description = NA) |>
dplyr::group_by(id_project) |>
dplyr::summarise(title = paste0(na.omit(unique(title)), collapse = ";"), investigator = paste0(na.omit(unique(investigator)), collapse = ";")) |>
dplyr::mutate(title = dplyr::na_if(title, ""), investigator = dplyr::na_if(investigator, "")) |>
dplyr::distinct()
dplyr
::dbWriteTable(con, "project", projects, append = TRUE) DBI
Table report
<- projects_reports |>
reports ::select(id_project = project_id, id_report = report_id, report_date, report_access_path) |>
dplyr::distinct()
dplyr
::dbWriteTable(con, "report", reports, append = TRUE) DBI
Table lab_sample
<- itgr_measurements() |>
lab_sample ::select(id_lab_sample = SampleID) |>
dplyr::distinct() dplyr
::dbWriteTable(con, "lab_sample", lab_sample, append = TRUE) DBI
Table lab_field_sample
<- samples |>
lab_field_samples ::select(id_lab_sample, id_field_sample, -id_source_report) |>
dplyr::separate_longer_delim(id_lab_sample, delim = ";") |>
tidyr::distinct()
dplyr
## Add missing lab_sample
<- DBI::dbGetQuery(con, "SELECT DISTINCT id_lab_sample FROM lab_sample;")
present_lab_sample_id
<- dplyr::select(lab_field_samples, id_lab_sample) |>
missing_lab_sample ::distinct() |>
dplyr::filter(!(id_lab_sample %in% present_lab_sample_id$id_lab_sample))
dplyr
::dbWriteTable(con, "lab_sample", missing_lab_sample, append = TRUE) DBI
::dbWriteTable(con, "lab_field_sample", lab_field_samples, append = TRUE) DBI
Table analytes
<- readRDS("data/tbl_analytes.rds") |>
analytes ::select(
dplyrid_analyte = final_id,
name,
other_name,
short_name,unit = Units,
family = conpound_family,
casid,
pubcid,
is_dry_weight,
on_isolated_lipid,note_analyte = notes
)
::dbWriteTable(con, "analyte", analytes, append = TRUE) DBI
Table lab_measurements
<- readRDS("data/tbl_analytes.rds") |>
analytes_ref ::separate_rows(original_ids, sep = ";") |>
tidyr::select(original_ids, final_id) |>
dplyr::distinct()
dplyr
<- itgr_measurements() |>
measurements ::mutate(
dplyrkey = stringr::str_replace(variable, "PCB-|PCB ", "PCB") |>
::make_clean_names(allow_dupes = TRUE, case = "none") |> tolower()
janitor|> dplyr::left_join(analytes_ref, by = c("key" = "original_ids")) |>
) ::filter(conpound_family != "SImean")
dplyr
<- dplyr::filter(measurements, final_id %in% c("plipid", "pmoisture")) |>
percent_moisture_lipid ::select(-variable, -key) |>
dplyr::pivot_wider(names_from = "final_id", values_from = "value")
tidyr
<- dplyr::filter(measurements, !final_id %in% c("plipid", "pmoisture")) |>
measurements ::left_join(percent_moisture_lipid) |>
dplyr::filter(!is.na(value) & value != "" & value != "NA" & value != "NDR")
dplyr
<- measurements |> dplyr::select(
measurements id_analyte = final_id,
id_lab_sample = SampleID,
value,percent_lipid = plipid,
percent_moisture = pmoisture
|>
) ::mutate(id_lab_sample = stringr::str_replace(id_lab_sample, "\\s\\w+$", "")) |>
dplyr::mutate(id_lab_sample = stringr::str_replace(id_lab_sample, "[*]", ""))
dplyr
<- measurements |>
measurements ::mutate(is_censored = toxbox::detect_cens(value)) |>
dplyr::mutate(value = toxbox::remove_cens(value)) |>
dplyr::filter(!is.na(value)) dplyr
::dbWriteTable(con, "lab_measurement", measurements, append = TRUE) DBI