1 Univ. Grenoble Alpes, CEA, CNRS, INRAE, IRIG, Laboratoire de Physiologie Cellulaire & Végétale, 38000 Grenoble.
2 Univ. Grenoble-Alpes, CNRS, Jardin du Lautaret, 38000, Grenoble, France.
3 Univ. Grenoble-Alpes, Univ. Savoie Mont Blanc, CNRS, LECA, 38000, Grenoble, France.
@ Correspondence: Eric Maréchal <eric.marechal@cea.fr>, Eric Coissac <eric.coissac@metabarcoding.org>
Go back to the global description of the project
- `ROBITools` package is used to read result files produced by OBITools.
- `ROBITaxonomy` package provides function allowing to query OBITools formated taxonomy.
if (!require(ROBITools)) {
# ROBITools are not available on CRAN and have to be installed
# from http://git.metabarcoding.org using devtools
if (!require(devtools)) {
install.packages("devtools")
}
devtools::install_git("https://git.metabarcoding.org/obitools/ROBIUtils.git")
devtools::install_git("https://git.metabarcoding.org/obitools/ROBITaxonomy.git")
devtools::install_git("https://git.metabarcoding.org/obitools/ROBITools.git")
library(ROBITools)
}
## Loading required package: ROBITools
## ROBITools package
library(ROBITaxonomy)
##
## Attaching package: 'ROBITaxonomy'
## The following object is masked from 'package:stats':
##
## family
tidyverse
provides various method for efficient data manipulation and plotting via ggplot2
if (!require(tidyverse)) {
install.packages("tidyverse")
library(tidyverse)
}
## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.0 ✓ dplyr 1.0.5
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
vegan
is loaded for its decostand
functionif (!require(vegan)) {
install.packages("vegan")
library(vegan)
}
## Loading required package: vegan
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.5-7
##
## Attaching package: 'vegan'
## The following object is masked from 'package:ROBITools':
##
## rarefy
ade4
provides multivariate analysis functionsif (!require(ade4)) {
install.packages("ade4")
library(ade4)
}
## Loading required package: ade4
ncbi <- read.taxonomy("embl-140/ncbi20190930")
chlo02_raw <- import.metabarcoding.data("chlo02/chlo02.ecotag.tab")
chlo02_raw.clean <- extracts.obiclean(chlo02_raw)
rm(chlo02_raw)
sample_names <- rownames(chlo02_raw.clean)
positive_control <- grep(pattern = "^CP[0-9]{2}_", x = sample_names)
negative_control <- grep(pattern = "^C[0-9]+_", x = sample_names)
blanks <- grep(pattern = "^BLK[0-9]{2}_", x = sample_names)
category <- factor(rep("sample", nrow(chlo02_raw.clean)),
levels = c("sample", "positive_ctrl", "negative_ctrl", "blank")
)
category[positive_control] <- "positive_ctrl"
category[negative_control] <- "negative_ctrl"
category[blanks] <- "blank"
chlo02_raw.clean@samples$category <- category
sampleid <- sapply(sample_names, function(x) strsplit(x, split = "_")[[1]][1])
chlo02_raw.clean@samples$sampleid <- factor(sampleid)
site <- factor(category, levels = c(
levels(category),
"ANT", "CHA", "LOR", "RIS", "VCH"
))
for (s in c("ANT", "CHA", "LOR", "RIS", "VCH")) {
p <- grep(pattern = paste0("^", s, "[0-9]+_"), x = sample_names)
site[p] <- s
}
chlo02_raw.clean@samples$site <- site
Plots for every PCR its diversity as a function of read counts
chlo02_raw.clean %>% plot_reads_x_motus(q = 1, threshold = 0.00, limit = 500) +
geom_hline(yintercept = 4)
# Sets the count of not true sequences to 0.
chlo02_raw.center <- chlo02_raw.clean
chlo02_raw.center@reads[chlo02_raw.clean$obiclean_status != "h"] <- 0
# Removes MOTUs occurring in any samples
chlo02_raw.center <- chlo02_raw.center[, colSums(chlo02_raw.center@reads) > 0]
# Removes empty samples
chlo02_raw.center <- chlo02_raw.center[which(rowSums(chlo02_raw.center@reads) > 0), ]
# Erases the original large data set
rm(chlo02_raw.clean)
The Not annoted
category corresponds to MOTU without similarity with the reference database. The ’Others" category corresponds to non Chlorophyceae MOTUs.
motus_cat <- rep("Others", ncol(chlo02_raw.center))
chlorophita.taxid <- ecofind(ncbi, "^chlorophyceae$")
is_chlorophyceae <- is.subcladeof(ncbi, chlo02_raw.center@motus$taxid, chlorophita.taxid)
motus_cat[is_chlorophyceae] <- "Chlorophyceae"
motus_cat[chlo02_raw.center@motus$taxid == 1] <- "Not annoted"
chlo02_raw.center@motus$taxo_group <- factor(motus_cat,
levels = c("Chlorophyceae", "Others", "Not annoted")
)
Plot for each PCR then hill’s number diversity for \(q=1\) as a function of read numbers associated to that PCR.
The theoritical Hill’s number for positive controle is \(^1D=4\).
chlo02_raw.center %>% plot_reads_x_motus(q = 1, threshold = 0.00, limit = 500) +
geom_hline(yintercept = 4)
The table resumes for each sequence occurring in positive controls how many times they occure in samples, positive controles and negative controles
chlo02_in_positive_ctrls <- which(
colSums(
chlo02_raw.center@reads[
chlo02_raw.center@samples$category == "positive_ctrl",
]
) > 0
)
data.frame(
sample = colSums(chlo02_raw.center@reads[chlo02_raw.center@samples$category == "sample", chlo02_in_positive_ctrls]),
positive = colSums(chlo02_raw.center@reads[
chlo02_raw.center@samples$category == "positive_ctrl",
chlo02_in_positive_ctrls
]),
negative = colSums(chlo02_raw.center@reads[
chlo02_raw.center@samples$category == "negative_ctrl",
chlo02_in_positive_ctrls
])
)
cont <- "HISEQ:290:CCU08ANXX:1:1101:6513:2094_CONS_SUB_SUB_CMP"
data.frame(
motus = chlo02_raw.center@reads[, cont],
site = chlo02_raw.center@samples$site
) %>%
ggplot(aes(x = site, y = motus)) +
geom_boxplot() +
scale_y_sqrt()
tapply(chlo02_raw.center@reads[, cont], chlo02_raw.center@samples$site, max, na.rm = TRUE)
## sample positive_ctrl negative_ctrl blank ANT
## 40117 219840 0 10 5
## CHA LOR RIS VCH
## 12 0 9 8
cont <- "HISEQ:290:CCU08ANXX:1:1101:20062:4228_CONS_SUB_SUB_CMP"
data.frame(
motus = chlo02_raw.center@reads[, cont],
site = chlo02_raw.center@samples$site
) %>%
ggplot(aes(x = site, y = motus)) +
geom_boxplot() +
scale_y_sqrt()
tapply(chlo02_raw.center@reads[, cont], chlo02_raw.center@samples$site, max, na.rm = TRUE)
## sample positive_ctrl negative_ctrl blank ANT
## 160 1012 0 0 0
## CHA LOR RIS VCH
## 0 0 0 0
chlo02_in_negative_ctrls <- which(
colSums(
chlo02_raw.center@reads[
chlo02_raw.center@samples$category == "negative_ctrl",
]
) > 0
)
maxneg <- aggregate(chlo02_raw.center[, chlo02_in_negative_ctrls],
MARGIN = 1,
by = list(category = chlo02_raw.center@samples$category),
FUN = max
)
neg_ctrls <- data.frame(
sample = colSums(chlo02_raw.center@reads[
chlo02_raw.center@samples$category == "sample",
chlo02_in_negative_ctrls
]),
positive = colSums(chlo02_raw.center@reads[
chlo02_raw.center@samples$category == "positive_ctrl",
chlo02_in_negative_ctrls
]),
negative = colSums(chlo02_raw.center@reads[
chlo02_raw.center@samples$category == "negative_ctrl",
chlo02_in_negative_ctrls
]),
max_in = levels(maxneg@samples$category)[apply(maxneg@reads, 2, which.max)],
idx = chlo02_in_negative_ctrls
)
neg_ctrls
neg_ctrls %>% ggplot(aes(x = sample, y = negative, col = max_in)) +
geom_point() +
scale_x_log10() +
scale_y_log10() +
geom_abline(slope = 1, intercept = 0, col = "red")
## Warning: Transformation introduced infinite values in continuous x-axis
Each MOTU more abundant in the negative controls than in the samples is removed.
chlo02_raw.center <- chlo02_raw.center[, -(neg_ctrls[neg_ctrls$max_in == "negative_ctrl", "idx"])]
Clean the dataset by removing every empty PCR
chlo02_raw.center <- chlo02_raw.center[which(rowSums(chlo02_raw.center@reads) > 0), ]
dim(chlo02_raw.center)
## [1] 529 8608
Counts how many PCR replicates are conserved at that level of the analysis.
n_pcr <- table(chlo02_raw.center@samples$sampleid)[as.character(chlo02_raw.center@samples$sampleid)]
table(n_pcr, chlo02_raw.center@samples$site)
##
## n_pcr sample positive_ctrl negative_ctrl blank ANT CHA LOR RIS VCH
## 1 0 0 2 1 1 0 1 0 1
## 2 9 0 2 0 24 14 22 27 8
## 3 35 64 0 0 69 84 66 55 44
The sample column is equal to zero because sample category is actually distributed among sites At that level every blanks are empty and no more PCRs corresponding to them are conserved
Plots for every PCR its diversity as a function of read counts
chlo02_raw.center %>% plot_reads_x_motus(q = 1, threshold = 0.00, limit = 200) + geom_hline(yintercept = 4)
Keeps only PCRs with more than 200 reads. MOTUs having no more representants are removed
chlo02_raw.center <- chlo02_raw.center[rowSums(chlo02_raw.center@reads) >= 200, ]
chlo02_raw.center <- chlo02_raw.center[, colSums(chlo02_raw.center@reads) > 0]
chlo02.keep1 <- tag_bad_pcr(chlo02_raw.center@samples$sampleid,
counts = chlo02_raw.center@reads
)
The dissimilarity distribution of PCR replicates is bimodal. Low dissimilarity replicates correspond to positive controls, the second mode corresponds to samples.
chlo02_by_taxo_group <- aggregate(chlo02_raw.center, MARGIN = 2, by = list(chlo02_raw.center@motus$taxo_group), FUN = sum)
chlo02_raw.center@samples$chlorophyceae_part <- chlo02_by_taxo_group@reads[, 1] / rowSums(chlo02_by_taxo_group@reads)
chlo02_raw.center@samples$dist_barycenter <- chlo02.keep1$distance
The lower replicability of the samples is be explained by a very low amount of DNA as shown in the following plot.
data <- data.frame(
x = chlo02.keep1$distance,
y = 1 - chlo02_by_taxo_group@reads[, 1] / rowSums(chlo02_by_taxo_group@reads),
col = chlo02_raw.center@samples$category,
pch = as.character(table(chlo02_raw.center@samples$sampleid)[as.character(chlo02_raw.center@samples$sampleid)]),
ok = table(chlo02_raw.center@samples$sampleid)[as.character(chlo02_raw.center@samples$sampleid)] > 1 &
chlo02_raw.center@samples$category == "sample"
)
ggplot(data[table(chlo02_raw.center@samples$sampleid)[as.character(chlo02_raw.center@samples$sampleid)] > 1, ], aes(x = y, y = x, col = col)) +
geom_point() +
scale_y_log10() +
ylab("Distance among PCR replicates") +
xlab("Fraction of non Chlorophyceae reads") +
stat_smooth(data = data[data$ok, ], col = "black", method = "lm", se = FALSE) +
theme(legend.title = element_blank())
## `geom_smooth()` using formula 'y ~ x'
Estimates the linear model for the sample PCRs.
ok <- table(chlo02_raw.center@samples$sampleid)[as.character(chlo02_raw.center@samples$sampleid)] > 1 &
chlo02_raw.center@samples$category == "sample" & data$x > 0 & data$y > 0
l <- lm(log(data$x[ok]) ~ data$y[ok], na.action = na.omit)
summary(l)
##
## Call:
## lm(formula = log(data$x[ok]) ~ data$y[ok], na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.40960 -0.18667 0.01735 0.17777 1.20383
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.6857 0.1034 -25.97 <2e-16 ***
## data$y[ok] 2.1402 0.1109 19.30 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2995 on 318 degrees of freedom
## Multiple R-squared: 0.5396, Adjusted R-squared: 0.5381
## F-statistic: 372.6 on 1 and 318 DF, p-value: < 2.2e-16
cor(log(data$x[ok]), data$y[ok], use = "complete.obs")^2
## [1] 0.5395574
More Chlorophyceae reads in the PCR closer are the replicates. The Chlorophyceae amount of DNA seems to be the limitant factor.
We now just focus on the target group and remove every control PCRs
chlo02.chlorophyceae <- chlo02_raw.center[, chlo02_raw.center@motus$taxo_group == "Chlorophyceae"]
chlo02.chlorophyceae <- chlo02.chlorophyceae[which(rowSums(chlo02.chlorophyceae@reads) > 0), ]
chlo02.chlorophyceae <- chlo02.chlorophyceae[which(chlo02.chlorophyceae@samples$category == "sample"), ]
chlo02.chlorophyceae <- chlo02.chlorophyceae[, which(colSums(chlo02.chlorophyceae@reads) > 0)]
and we merge the sites metadata
orchamp_env <- read.csv("orchamp_env.csv", header = TRUE)
sample_descr <- read.csv2("labels_metabar_trie.csv", header = TRUE)
code_plot <- paste(sample_descr$Site_short, sample_descr$Altitude, sample_descr$Horizon, sep = "_")
sample_descr$Code_plot <- code_plot
left_join(sample_descr, orchamp_env) %>% right_join(chlo02.chlorophyceae@samples, by = c(EXTRACTION.CODE = "sampleid")) -> samples
## Joining, by = c("Altitude", "Horizon", "Code_plot")
rownames(samples) <- as.character(samples$sample)
chlo02.chlorophyceae@samples <- samples
chlo02.chlorophyceae@motus$class <- taxonatrank(ncbi, chlo02.chlorophyceae@motus$taxid, "class", name = TRUE)
table(table(chlo02.chlorophyceae@samples$EXTRACTION.CODE)[as.character(chlo02.chlorophyceae@samples$EXTRACTION.CODE)], chlo02.chlorophyceae@samples$Site_short)
##
## ANT CHA LOR RIS VCH
## 1 11 2 9 7 10
## 2 8 16 2 22 10
## 3 6 45 6 15 18
chlo02.chlorophyceae %>% plot_reads_x_motus(q = 1, threshold = 0.00) + geom_hline(yintercept = 4)
dir.create("cleaned_datasets", showWarnings = FALSE)
write.csv2(chlo02.chlorophyceae@reads, file = "cleaned_datasets/Chlo02.cleaned.reads.csv")
write.csv2(chlo02.chlorophyceae@motus, file = "cleaned_datasets/Chlo02.cleaned.motus.csv")
write.csv2(chlo02.chlorophyceae@samples, file = "cleaned_datasets/Chlo02.cleaned.samples.csv")