Load all the packages
library("ranacapa")
library("phyloseq")
library("ggplot2")
library("stringr")
library("plyr")
library("reshape2")
library("reshape")
library("dplyr")
library("tidyr")
library("doBy")
library("plyr")
library("microbiome")
library("ggpubr")
library("vegan")
library("tidyverse")
library("magrittr")
library("cowplot")
library("dendextend")
library("WGCNA")
library("metagenomeSeq")
library("decontam")
library("RColorBrewer")
library("ampvis2")
library("ggpubr")
library("formatR")
library("DT")
library("taxcleanr") #devtools::install_github("shashank-KU/taxcleanr")
library("pheatmap")
library("DESeq2")
library("microbiomeMarker")
A total of 30 salmon were sampled at the first time point (T0), followed by the sampling of 120 salmon individuals (30 for each diet) at each subsequent time point (T1 – 5 weeks after the trial start; T2 – 10 weeks after the trial start; T3 – 15 weeks after the trial start). The distal gut section was dissected using sterile scalpels and tweezers, and approximately 100 mg of gut content was collected from each fish using inoculation loops.
annotation_colors <- list(
Timepoint = c(`T0`="gray" ,`T1` = "#FF7F00", `T2` = "#FFD92F", `T3`="#F781BF"),
Diet = c(`ext-ctrl`="#FDC086", `CTR`="#e41a1cff", `MC1`="#377EB8", `MC2`="#4DAF4A", `MN3`="#984EA3"))
# Specify the path to your CSV file on GitHub
csv_file <- "Phenotypic/Metrics_low_dosage.csv"
csv_url <- paste0("https://raw.githubusercontent.com/shashank-KU/ImprovaFish-MDF-Effects/main/", csv_file)
# Read the CSV file
fish_data <- read.csv(url(csv_url), row.names = 1)
fish_data$Diet <- factor(fish_data$Diet, levels=c('start', 'CTR', 'MC1', 'MC2', 'MN3'))
plot2 <- ggboxplot(fish_data, x = "Diet", y = "BW",
color = "black", palette = "jco", legend = "none", outlier.shape = NA)+
#stat_compare_means(comparisons = my_comparisons, hide.ns = F) +
#stat_compare_means() +
ggplot2::geom_jitter(
mapping = aes_string(fill="Diet"),
position = ggplot2::position_jitter(seed=123),
shape = 21,
color = "black",
size = 2,
alpha = 0.8
) +
theme_bw() + ylab("Body weigth") +
theme(legend.position="none",axis.title.x=element_blank()) +
scale_fill_manual(values = annotation_colors$Diet) +
scale_colour_manual( values = annotation_colors$Diet) +
facet_grid(. ~ Timepoint, scales="free_x", space = "free_x")
# fish_data %>%
# complete(Timepoint, Diet, fill = list(BW = NA, Length = NA)) %>%
# group_by(Timepoint, Diet) %>%
# summarise(across(c(Heart_Score, Fat_Score,cataract_V,cataract_H ), mean, na.rm = TRUE), .groups = 'drop') %>%
# mutate(across(where(is.numeric), round, 2)) %>%
# DT::datatable()
library(dplyr)
library(tidyr)
fish_data %>%
complete(Timepoint, Diet, fill = list(BW = NA, Length = NA, GW = NA, CF = NA, HSI = NA, CSI = NA)) %>%
group_by(Timepoint, Diet) %>%
summarise(
Mean_BW = mean(BW, na.rm = TRUE),
Mean_Length = mean(Length, na.rm = TRUE),
Mean_GW = mean(GW, na.rm = TRUE),
Mean_CF = mean(CF, na.rm = TRUE),
Mean_HSI = mean(HSI, na.rm = TRUE),
Mean_CSI = mean(CSI, na.rm = TRUE),
Mean_Heart_W = mean(Heart_W, na.rm = TRUE),
Mean_Liver_W = mean(Liver_W, na.rm = TRUE),
SEM_BW = sd(BW, na.rm = TRUE) / sqrt(sum(!is.na(BW))),
SEM_Length = sd(Length, na.rm = TRUE) / sqrt(sum(!is.na(Length))),
SEM_GW = sd(GW, na.rm = TRUE) / sqrt(sum(!is.na(GW))),
SEM_CF = sd(CF, na.rm = TRUE) / sqrt(sum(!is.na(CF))),
SEM_HSI = sd(HSI, na.rm = TRUE) / sqrt(sum(!is.na(HSI))),
SEM_CSI = sd(CSI, na.rm = TRUE) / sqrt(sum(!is.na(CSI))),
SEM_Heart_W = sd(Heart_W, na.rm = TRUE) / sqrt(sum(!is.na(Heart_W))),
SEM_Liver_W = sd(Liver_W, na.rm = TRUE) / sqrt(sum(!is.na(Liver_W))),
.groups = 'drop'
) %>%
mutate(
across(where(is.numeric), ~round(., 2)),
across(starts_with("Mean_"), ~paste(., "±", get(paste0("SEM_", sub("Mean_", "", cur_column())))))
) %>%
filter(complete.cases(.)) %>%
select(Timepoint, Diet, starts_with("Mean_"), -starts_with("SEM_")) %>%
rename_with(~sub("Mean_", "", .), starts_with("Mean_")) %>%
t() %>%
DT::datatable()
Import data and clean the taxonomy
base_url <- "https://raw.githubusercontent.com/shashank-KU/ImprovaFish-MDF-Effects/main/"
# Specify the path to your RDS file on GitHub
rds_file <- "Metagenomics/low_dosage_trial/metagenomics_low_MDF.rdata"
rds_url <- paste0(base_url, rds_file)
# Read the RDS file
all <- readRDS(url(rds_url))
tax <- data.frame(tax_table(all), stringsAsFactors = FALSE)
tax <- tax[,1:7] # No info in col 8-15
# Set informative colnames
colnames(tax) <- c("Kingdom", "Phylum","Class","Order","Family","Genus", "Species")
tax.clean <- data.frame(row.names = row.names(tax),
Kingdom = str_replace(tax[,1], "d__",""),
Phylum = str_replace(tax[,2], "p__",""),
Class = str_replace(tax[,3], "c__",""),
Order = str_replace(tax[,4], "o__",""),
Family = str_replace(tax[,5], "f__",""),
Genus = str_replace(tax[,6], "g__",""),
Species = str_replace(tax[,7], "s__",""),
stringsAsFactors = FALSE)
tax.clean[is.na(tax.clean)] <- ""
# - Clean rank by rank
# Kingdom - Remove the unassigned completely
# Phylum
# Class
# Remove extra info about origin from some bacteria
# Remove all fields that contain "uncultured", "Unknown" or "Ambigious"
bad <- c("Ambiguous_taxa","uncultured", "Subgroup_21")
tax.clean[tax.clean$Class %in% bad,3:7] <- ""
# Order
bad <- c("0319-6G20","1-20","11-24", "ADurb.Bin180","D8A-2", "Group_1.1c", "JGI_0000069-P22","Marine_Group_II",
"Pla3_lineage","Run-SP154",
"Ambiguous_taxa", "uncultured", "UBA10353_marine_group",
"Subgroup_17", "SAR86_clade", "SAR11_clade", "SAR202_clade")
tax.clean[tax.clean$Order %in% bad,4:7] <- ""
# Family
bad <- c("Ambiguous_taxa","11-24","67-14", "uncultured", "SAR116_clade", "Run-SP154",
"Marine_Group_II", "env.OPS_17", "SAR116_clade", "S085", "S-70", "NS9_marine_group", "Mitochondria")
tax.clean[tax.clean$Family %in% bad,5:7] <- ""
# Genus
bad <- c("Ambiguous_taxa","Unknown_Family","uncultured","Subgroup_10", "1174-901-12", "67-14")
tax.clean[tax.clean$Genus %in% bad,6:7] <- ""
# Species
bad <- c("Ambiguous_taxa","marine_metagenome","low_GC","wastewater_metagenome","unidentified",
"uncultured_synthetic", "uncultured_organism")
tax.clean[tax.clean$Species %in% bad,6:7] <- ""
#tax.clean[grepl("uncultured", tax.clean$Species),"Species"] <- ""
#tax.clean[grepl("unidentified", tax.clean$Species),"Species"] <- ""
# Remove remove ".", change "-" and " " to "_"
for (i in 1:ncol(tax.clean)){
tax.clean[,i] <- str_replace_all(tax.clean[,i], "[.]","")
tax.clean[,i] <- str_replace_all(tax.clean[,i], "[(]","")
tax.clean[,i] <- str_replace_all(tax.clean[,i], "[)]","")
tax.clean[,i] <- str_replace_all(tax.clean[,i], "-","_")
tax.clean[,i] <- str_replace_all(tax.clean[,i], " ","_")
}
for (i in 1:7){ tax.clean[,i] <- as.character(tax.clean[,i])}
# File holes in the tax table
for (i in 1:nrow(tax.clean)){
# Fill in missing taxonomy
if (tax.clean[i,2] == ""){
kingdom <- paste("Kingdom_", tax.clean[i,1], sep = "")
tax.clean[i, 2:7] <- kingdom
} else if (tax.clean[i,3] == ""){
phylum <- paste("Phylum_", tax.clean[i,2], sep = "")
tax.clean[i, 3:7] <- phylum
} else if (tax.clean[i,4] == ""){
class <- paste("Class_", tax.clean[i,3], sep = "")
tax.clean[i, 4:7] <- class
} else if (tax.clean[i,5] == ""){
order <- paste("Order_", tax.clean[i,4], sep = "")
tax.clean[i, 5:7] <- order
} else if (tax.clean[i,6] == ""){
family <- paste("Family_", tax.clean[i,5], sep = "")
tax.clean[i, 6:7] <- family
} else if (tax.clean[i,7] == ""){
tax.clean$Species[i] <- paste("Genus_",tax.clean$Genus[i], sep = "_")
}
}
rm(bad, class, family, i, kingdom,new,order,phylum,uncul)
tax_table(all) <- as.matrix(tax.clean)
all
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 7521 taxa and 168 samples ]
## sample_data() Sample Data: [ 168 samples by 12 sample variables ]
## tax_table() Taxonomy Table: [ 7521 taxa by 7 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 7521 tips and 7465 internal nodes ]
## refseq() DNAStringSet: [ 7521 reference sequences ]
all.clean <- subset_taxa(all,
Kingdom != "Unassigned" &
Kingdom != "Archaea" &
Kingdom != "Eukaryota" &
Family != "Mitochondria" &
Order != "Chloroplast" &
Genus != "Ralstonia"
) %>%
prune_taxa(taxa_sums(.) > 0, .)
all.clean
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 6865 taxa and 168 samples ]
## sample_data() Sample Data: [ 168 samples by 12 sample variables ]
## tax_table() Taxonomy Table: [ 6865 taxa by 7 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 6865 tips and 6817 internal nodes ]
## refseq() DNAStringSet: [ 6865 reference sequences ]
Rarefaction plot
sample_data(all.clean)$New_Diet <- factor(sample_data(all.clean)$New_Diet, levels=c('ext-ctrl', 'CTR', 'MC1', 'MC2', 'MN3'))
annotation_colors <- list(
samplingTime = c(`T0`="gray" ,`T1` = "#FF7F00", `T2` = "#FFD92F", `T3`="#F781BF"),
New_Diet = c(`ext-ctrl`="#FDC086", `CTR`="#e41a1cff", `MC1`="#377EB8", `MC2`="#4DAF4A", `MN3`="#984EA3"))
p <- ggrare(all.clean, step = 1000,
color = "samplingTime",
se = F,
parallel = TRUE,
plot = FALSE)
cols <- c(brewer.pal(8,"Set1"), brewer.pal(7,"Dark2"),brewer.pal(7,"Set2"),brewer.pal(12,"Set3"),brewer.pal(7,"Accent"),brewer.pal(12,"Paired"),"gray")
p <- p + theme_bw() +
scale_fill_manual(values =annotation_colors$samplingTime) +
scale_colour_manual( values = annotation_colors$samplingTime) +
facet_wrap(~New_Diet) +
guides(color = guide_legend(title = "Life stage"))
Alpha and Beta diversity
shannon.div <- estimate_richness(all.clean, measures = c("Shannon", "Observed"))
sampledata1<- data.frame(sample_data(all.clean))
row.names(shannon.div) <- gsub("[.]","-", row.names(shannon.div))
sampleData <- merge(sampledata1, shannon.div, by = 0 , all = TRUE)
my_comparisons <- list( c("ext-ctrl", "MC1"), c("ext-ctrl", "MC2"), c("ext-ctrl", "MN3"), c("CTR", "MC1"), c("CTR", "MC2"), c("CTR", "MN3"))
p1 <- ggboxplot(sampleData, x = "New_Diet", y = "Observed",
color = "black", palette = "jco", legend = "none", outlier.shape = NA)+
stat_compare_means(comparisons = my_comparisons) +
stat_compare_means(label.y = 350) +
ggplot2::geom_jitter(
mapping = aes_string(fill="New_Diet"),
position = ggplot2::position_jitter(seed=123),
shape = 21,
color = "black",
size = 2,
alpha = 0.8
) +
theme_bw() +
theme(legend.position="none",axis.title.x=element_blank()) +
scale_fill_manual(values = annotation_colors$New_Diet) +
scale_colour_manual( values = annotation_colors$New_Diet) +
labs(y= "Observed richness (ASVs)")
set.seed(1)
PCoA_bray <- ordinate(physeq = all.clean, method = "PCoA", distance = "bray")
PCoA_bray_plot<- plot_ordination(
physeq = all.clean,
ordination = PCoA_bray,
color = "New_Diet"
) +
geom_point(aes(fill ="New_Diet" ), size =2) +
geom_point(shape = 1, size = 2,colour = "black") +
theme_bw() +
xlab("PCoA 1 [18.2 %]") + ylab("PCoA 2 [8.6 %]") +
stat_ellipse() +
scale_fill_manual(values = annotation_colors$New_Diet) +
scale_colour_manual( values = annotation_colors$New_Diet) +
guides(color = guide_legend(title = "Diet"))
bottom_row <- plot_grid(p1, PCoA_bray_plot, labels = c('B', 'C'), align = 'h', rel_widths = c(1, 1.3))
Supplementary Fig. S4
Computes the Bray-Curtis distance between all the samples in the dataset “all.clean” and tests for differences between samples in the variable “New_Diet” using the Adonis2 function. The Adonis2 function was run with 9999 permutations.
sampledf <- data.frame(sample_data(all.clean))
bcdist <- phyloseq::distance(all.clean, method="bray",normalized=TRUE)
adonis2(bcdist ~ New_Diet,
data = sampledf, permutations = 9999)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 9999
##
## adonis2(formula = bcdist ~ New_Diet, data = sampledf, permutations = 9999)
## Df SumOfSqs R2 F Pr(>F)
## New_Diet 4 3.395 0.06681 2.9174 0.0001 ***
## Residual 163 47.417 0.93319
## Total 167 50.812 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Contamination removal
df <- as.data.frame(sample_data(all.clean)) # Put sample_data into a ggplot-friendly data.frame
df$Sample_or_Control <- ifelse( df$New_Diet %in% c("ext-ctrl"), "Control_Sample", "True_Sample")
sample_data(all.clean) <- df
df$LibrarySize <- sample_sums(all.clean)
df <- df[order(df$LibrarySize),]
df$Index <- seq(nrow(df))
sample_data(all.clean)$is.neg <- sample_data(all.clean)$Sample_or_Control == "Control_Sample"
contamdf.prev05 <- isContaminant(all.clean, method="prevalence", neg="is.neg", threshold=0.5)
paste("The number of contamination found is",table(contamdf.prev05$contaminant)[2])
## [1] "The number of contamination found is 86"
all.noncontam <- prune_taxa(!contamdf.prev05$contaminant, all.clean)
psdata <- subset_samples(all.noncontam, Sample_or_Control=="True_Sample")
psdata <- prune_taxa(taxa_sums(psdata) > 0, psdata)
psdata
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 6468 taxa and 156 samples ]
## sample_data() Sample Data: [ 156 samples by 14 sample variables ]
## tax_table() Taxonomy Table: [ 6468 taxa by 7 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 6468 tips and 6426 internal nodes ]
## refseq() DNAStringSet: [ 6468 reference sequences ]
Inspect the number of reads per sample and compare to rarefaction curves
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 6468 taxa and 156 samples ]
## sample_data() Sample Data: [ 156 samples by 14 sample variables ]
## tax_table() Taxonomy Table: [ 6468 taxa by 7 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 6468 tips and 6426 internal nodes ]
## refseq() DNAStringSet: [ 6468 reference sequences ]
Taxonomic classification- Phylum level taxonomic distribution. Bars report the mean abundance for each individual sample.
psdata.r<- transform_sample_counts(psdata.p, function(x) x / sum(x) )
Final.RNA <- aggregate_rare(psdata.r, level = "Phylum", detection = 1/100, prevalence = 20/100)
getPalette = colorRampPalette(brewer.pal(10, "Dark2"))
PhylaPalette = getPalette(10)
Final.RNA_phylum_plot<- plot_composition(Final.RNA, sample.sort = "Proteobacteria",otu.sort = "abundance", verbose = TRUE)
Final.RNA_phylum_plot <- Final.RNA_phylum_plot +
theme_bw() +
theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
scale_fill_manual(values = PhylaPalette)
Final.RNA_phylum_plot
#Bacterial Community Composition for Manuscript
Final.seq.melt.RNA <- psmelt(tax_glom(psdata.r, "Species"))
tax_ranks <- c("Phylum", "Class", "Order", "Family", "Genus", "Species")
for (rank in tax_ranks) {
n_unique <- length(unique(Final.seq.melt.RNA[[rank]]))
message(paste(rank, ": ", n_unique, sep = ""))
}
## [1] "number of unique Phylum is 44"
paste("number of unique Class is", table(grepl("Kingdom|Phylum", unique(Final.seq.melt.RNA$Class)))[1])
## [1] "number of unique Class is 106"
paste("number of unique Order is", table(grepl("Kingdom|Class|Phylum", unique(Final.seq.melt.RNA$Order)))[1])
## [1] "number of unique Order is 243"
paste("number of unique Family is", table(grepl("Kingdom|Order|Class|Phylum", unique(Final.seq.melt.RNA$Family)))[1])
## [1] "number of unique Family is 396"
paste("number of unique Genus is", table(grepl("Kingdom|Family|Order|Class|Phylum", unique(Final.seq.melt.RNA$Genus)))[1])
## [1] "number of unique Genus is 839"
paste("number of unique Species is", table(grepl("Kingdom|Family|Order|Class|Phylum|Genus", unique(Final.seq.melt.RNA$Species)))[1])
## [1] "number of unique Species is 419"
Supplementary table for the manuscript
Phylum_df <- summaryBy(Abundance~Phylum, data=Final.seq.melt.RNA, FUN=sum)
Phylum_df$Percent <- round(Phylum_df$Abundance.sum/sum(Phylum_df$Abundance.sum)*100, 4)
Phylum_df <- plyr::arrange(Phylum_df, plyr::desc(Percent))
Phylum_df$PercentageRound <- round(Phylum_df$Percent, digits = 2)
DT::datatable(Phylum_df)
class_df <- summaryBy(Abundance~Class, data=Final.seq.melt.RNA, FUN=sum)
class_df$Percent <- round(class_df$Abundance.sum/sum(class_df$Abundance.sum)*100, 4)
class_df <- plyr::arrange(class_df, plyr::desc(Percent))
class_df$PercentageRound <- round(class_df$Percent, digits = 2)
DT::datatable(class_df)
order_df <- summaryBy(Abundance~Order, data=Final.seq.melt.RNA, FUN=sum)
order_df$Percent <- round(order_df$Abundance.sum/sum(order_df$Abundance.sum)*100, 4)
order_df <- plyr::arrange(order_df, plyr::desc(Percent))
order_df$PercentageRound <- round(order_df$Percent, digits = 2)
DT::datatable(order_df)
family_df <- summaryBy(Abundance~Family, data=Final.seq.melt.RNA, FUN=sum)
family_df$Percent <- round(family_df$Abundance.sum/sum(family_df$Abundance.sum)*100, 4)
family_df <- plyr::arrange(family_df, plyr::desc(Percent))
family_df$PercentageRound <- round(family_df$Percent, digits = 2)
DT::datatable(family_df)
genus_df <- summaryBy(Abundance~Genus, data=Final.seq.melt.RNA, FUN=sum)
genus_df$Percent <- round(genus_df$Abundance.sum/sum(genus_df$Abundance.sum)*100, 4)
genus_df <- plyr::arrange(genus_df, plyr::desc(Percent))
genus_df$Round <- round(genus_df$Percent, digits = 2)
DT::datatable(genus_df)
Supplementary Figure
FigureS4_1 <- transform_sample_counts(psdata.r, function(x) x / sum(x) )
FigureS4_1 <- aggregate_top_taxa(FigureS4_1, level = "Phylum", top = 10)
getPalette = colorRampPalette(brewer.pal(10, "Dark2"))
PhylaPalette = getPalette(10)
cols <- c(brewer.pal(8,"Set1"), brewer.pal(7,"Dark2"),brewer.pal(7,"Set2"),brewer.pal(12,"Set3"),brewer.pal(7,"Accent"),brewer.pal(12,"Paired"),"gray")
FigureS4_1_plot<- plot_composition(FigureS4_1, sample.sort = "Proteobacteria",otu.sort = "abundance", verbose = TRUE)
FigureS4_1_plot <- FigureS4_1_plot +
theme_bw() +
#theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
scale_fill_manual(values = cols) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+ ylab("Relative abundance")+ labs(fill = "Taxa") +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank())
FigureS4_1 <- transform_sample_counts(psdata.r, function(x) x / sum(x) )
FigureS4_2 <- aggregate_top_taxa(FigureS4_1, level = "Class", top = 10)
getPalette = colorRampPalette(brewer.pal(10, "Dark2"))
PhylaPalette = getPalette(10)
cols <- c(brewer.pal(8,"Set1"), brewer.pal(7,"Dark2"),brewer.pal(7,"Set2"),brewer.pal(12,"Set3"),brewer.pal(7,"Accent"),brewer.pal(12,"Paired"),"gray")
FigureS4_2_plot<- plot_composition(FigureS4_2, sample.sort = "Gammaproteobacteria",otu.sort = "abundance", verbose = TRUE)
FigureS4_2_plot <- FigureS4_2_plot +
theme_bw() +
#theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
scale_fill_manual(values = cols) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+ ylab("Relative abundance")+ labs(fill = "Taxa") +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank())
FigureS4_1 <- transform_sample_counts(psdata.r, function(x) x / sum(x) )
FigureS4_3 <- aggregate_top_taxa(FigureS4_1, level = "Family", top = 10)
getPalette = colorRampPalette(brewer.pal(10, "Dark2"))
PhylaPalette = getPalette(10)
cols <- c(brewer.pal(8,"Set1"), brewer.pal(7,"Dark2"),brewer.pal(7,"Set2"),brewer.pal(12,"Set3"),brewer.pal(7,"Accent"),brewer.pal(12,"Paired"),"gray")
FigureS4_3_plot<- plot_composition(FigureS4_3, sample.sort = "Other",otu.sort = "abundance", verbose = TRUE)
FigureS4_3_plot <- FigureS4_3_plot +
theme_bw() +
#theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
scale_fill_manual(values = cols) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ylab("Relative abundance")+ labs(fill = "Taxa") +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank())
FigureS4_1 <- transform_sample_counts(psdata.r, function(x) x / sum(x) )
FigureS4_4 <- aggregate_top_taxa(FigureS4_1, level = "Genus", top = 10)
getPalette = colorRampPalette(brewer.pal(10, "Dark2"))
PhylaPalette = getPalette(10)
cols <- c(brewer.pal(8,"Set1"), brewer.pal(7,"Dark2"),brewer.pal(7,"Set2"),brewer.pal(12,"Set3"),brewer.pal(7,"Accent"),brewer.pal(12,"Paired"),"gray")
FigureS4_4_plot<- plot_composition(FigureS4_4, sample.sort = "Other",otu.sort = "abundance", verbose = TRUE)
FigureS4_4_plot <- FigureS4_4_plot +
theme_bw() +
#theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
scale_fill_manual(values = cols) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+ ylab("Relative abundance")+ labs(fill = "Taxa") +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank())
plot_grid(FigureS4_1_plot, FigureS4_3_plot, FigureS4_2_plot, FigureS4_4_plot, labels = 'AUTO', ncol = 2, rel_widths = c(1,1.1,1,1.1))
Alpha and Beta diversity
shannon.div <- estimate_richness(psdata.p, measures = c("Shannon", "Observed"))
sampledata1<- data.frame(sample_data(psdata.p))
row.names(shannon.div) <- gsub("[.]","-", row.names(shannon.div))
sampleData <- merge(sampledata1, shannon.div, by = 0 , all = TRUE)
sampleData <- sampleData %>%
mutate(New_Diet = case_when(
samplingTime == "T0" ~ "start",
TRUE ~ New_Diet
))
sampleData$New_Diet <- factor(sampleData$New_Diet, levels=c( 'CTR', 'MC1', 'MC2', 'MN3'))
sampleData$samplingTime <- factor(sampleData$samplingTime, levels=c('T0', 'T1', 'T2', 'T3'))
my_comparisons <- list( c("CTR", "MC1"), c("CTR", "MC2"), c("CTR", "MN3"),
c("MC1", "MC2"), c("MC1", "MN3"), c("MC2", "MN3") )
plot1 <- ggboxplot(sampleData, x = "New_Diet", y = "Shannon",
color = "black", palette = "jco", legend = "none", outlier.shape = NA)+
geom_pwc(comparisons = my_comparisons, hide.ns = TRUE) +
#stat_compare_means(label.y = 5) +
ggplot2::geom_jitter(
mapping = aes_string(fill="New_Diet"),
position = ggplot2::position_jitter(seed=123),
shape = 21,
color = "black",
size = 2,
alpha = 0.8
) +
theme_bw() + scale_y_continuous("Shannon index") +
theme(legend.position="none",axis.title.x=element_blank()) +
scale_fill_manual(values = annotation_colors$New_Diet) +
scale_colour_manual( values = annotation_colors$New_Diet) +
facet_grid(. ~ samplingTime, scales="free_x", space = "free_x")
my_comparisons <- list(c("T0", "T3"), c("T1", "T3"), c("T2", "T3"))
# Assuming your dataframe is named sampleData
sampleData$New_Diet[sampleData$New_Diet == "start"] <- "CTR"
plot2_x <- ggboxplot(sampleData, x = "samplingTime", y = "Shannon",
color = "black", palette = "jco", legend = "none", outlier.shape = NA)+
stat_compare_means(comparisons = my_comparisons) +
#stat_compare_means(label.y = 5) +
ggplot2::geom_jitter(
mapping = aes_string(fill="samplingTime"),
position = ggplot2::position_jitter(seed=123),
shape = 21,
color = "black",
size = 2,
alpha = 0.8
) +
theme_bw() +
theme(legend.position="none",axis.title.x=element_blank()) +
scale_fill_manual(values = annotation_colors$samplingTime) +
scale_colour_manual( values = annotation_colors$samplingTime) +
facet_grid(. ~ New_Diet) +
labs(y= "Shannon index") + ylim(c(1.5, 5.2))
set.seed(1)
PCoA_bray <- ordinate(physeq = psdata.p, method = "PCoA", distance = "bray")
plot3 <- plot_ordination(
physeq = psdata.p,
ordination = PCoA_bray,
color = "samplingTime"
) +
geom_point(aes(fill ="samplingTime" ), size =2) +
geom_point(shape = 1, size = 2,colour = "black") +
theme_bw() + ggtitle("PCoA Plot - Bray") +
theme(plot.title = element_text(hjust = 0.5)) +
ggtitle("Hindgut: Microbiome") +
xlab("PCoA 1 [21.1 %]") + ylab("PCoA 2 [7.7 %]") +
stat_ellipse() +
scale_fill_manual(values = annotation_colors$samplingTime) +
scale_colour_manual( values = annotation_colors$samplingTime) +
guides(color = guide_legend(title = "Time"))
# Run adonis test
sampledf <- data.frame(sample_data(psdata.p))
bcdist <- phyloseq::distance(psdata.p, method="bray",normalized=TRUE)
adonis2(bcdist ~ samplingTime, data = sampledf, permutations = 9999)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 9999
##
## adonis2(formula = bcdist ~ samplingTime, data = sampledf, permutations = 9999)
## Df SumOfSqs R2 F Pr(>F)
## samplingTime 3 8.711 0.20536 13.094 0.0001 ***
## Residual 152 33.709 0.79464
## Total 155 42.420 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
set.seed(1)
plot4 <- plot_ordination(
physeq = psdata.p,
ordination = PCoA_bray,
color = "New_Diet"
) +
geom_point(aes(fill ="New_Diet" ), size =2) +
geom_point(shape = 1, size = 2,colour = "black") +
theme_bw() + ggtitle("PCoA Plot - Bray") +
theme(plot.title = element_text(hjust = 0.5)) +
ggtitle("Hindgut: Microbiome") +
xlab("PCoA 1 [21.1 %]") + ylab("PCoA 2 [7.7 %]") +
stat_ellipse() +
scale_fill_manual(values = annotation_colors$New_Diet) +
scale_colour_manual( values = annotation_colors$New_Diet) +
guides(color = guide_legend(title = "Diet"))
# Run adonis test
sampledf <- data.frame(sample_data(psdata.p))
adonis2(bcdist ~ New_Diet, data = sampledf, permutations = 9999)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 9999
##
## adonis2(formula = bcdist ~ New_Diet, data = sampledf, permutations = 9999)
## Df SumOfSqs R2 F Pr(>F)
## New_Diet 3 0.914 0.02155 1.116 0.2255
## Residual 152 41.506 0.97845
## Total 155 42.420 1.00000
# plot5 <- plot_grid(plot1, plot2, labels = c('', ''), label_size = 12, ncol = 1, align = "hv")
# plot6 <- plot_grid(plot3, plot4, labels = c('', ''), label_size = 12, ncol = 1, align = "hv")
# plot_grid(plot5, plot6, ncol = 2, rel_widths = c(1.5, 1))
psdata1 <- subset_samples(psdata, samplingTime != "T0")
table_HM <- rabutable(psdata1, predictor= "New_Diet", type = "Genus", facet_wrap ="samplingTime", N_taxa = 20)
data = table_HM %>%
mutate(ID = paste0(variable, ":", wrap, "_", predictor2)) %>%
group_by(ID) %>%
summarize(meanvalue = mean(log(value))) %>%
mutate(feedtime = sub('.*:', '', ID)) %>%
mutate(ID = sub(':.*', '', ID)) %>%
pivot_wider(names_from = feedtime, values_from = meanvalue)
rabuplot_df_genus <- as.data.frame(data)
rownames(rabuplot_df_genus) <- data$ID
rabuplot_df_genus <- rabuplot_df_genus[,-1]
heatmap_colors <- colorRampPalette(c("#18b29f","#FFFFFF","#ac6721"), interpolate = "spline", space = "rgb")(51)
annotation_col <-
colnames(rabuplot_df_genus) %>%
as.data.frame() %>%
dplyr::rename(common = ".") %>%
tidyr::separate(col = common, into = c("Time", "Diet"), remove = F, extra = "drop") %>% # If replicates are present, we drop them here
dplyr::select(common,Time, Diet) %>%
tibble::column_to_rownames(var = "common")
annotation_colors_1 <- list(
Time = c(`T1` = "#FF7F00", `T2` = "#FFD92F", `T3`="#F781BF"),
Diet = c(`CTR`="#e41a1cff", `MC1`="#377EB8", `MC2`="#4DAF4A", `MN3`="#984EA3"))
row.names(rabuplot_df_genus)[3] <- "BCP group"
plot7 <- pheatmap(t(rabuplot_df_genus),
annotation_row = annotation_col,
annotation_colors = annotation_colors_1,
color = heatmap_colors,
scale = "row",
cluster_rows = F,
cluster_cols = T,
gaps_row = c(4,8),
show_colnames = T,
show_rownames = F)
Import data
# Replace this base URL with the raw content URL of your GitHub repository
base_url <- "https://raw.githubusercontent.com/shashank-KU/ImprovaFish-MDF-Effects/main/"
# Specify the paths to your files on GitHub
host_gut_rawCounts_url <- paste0(base_url, "Transcriptomics/low_dosage_trial/host_gut_count.txt")
sample_info_url <- paste0(base_url, "Transcriptomics/low_dosage_trial/sample_info.csv")
# Read the files
host_gut_rawCounts_total1 <- read.table(url(host_gut_rawCounts_url))
sample_info <- read.csv(url(sample_info_url), row.names = 1)
omics_data_host <- host_gut_rawCounts_total1
host_gut_mapping_file <- sample_info
Initial formatting Main idea of this chunk of code is make sure we
have same number of samples in countData
and samples in
colData
for DESeq2
. Also the order of the
samples in colData
need to match the order of the samples
in countData
sample_info$ID_New <- paste(sample_info$Time, "_",
sample_info$New_Diet, "_",
sample_info$Tank_number, "_", "0",
sample_info$Sample_Number,
sep = "")
head_change <- subset(sample_info, select=c("common" ,"ID_New"))
row.names(sample_info) <- sample_info$ID_New
omics_data_host1 <- omics_data_host[, colnames(omics_data_host) %in% sample_info$ID_New]
omics_data_host <- omics_data_host1
# rename colnames of omics_data_host using new ID
df1 <- subset(sample_info, select= c("common", "ID_New"))
omics_data_host %>%
rename_with(~deframe(df1)[.x], .cols = df1$ID_New) %>%
select(any_of(df1$ID_New)) -> omics_data_host_new
omics_data_host <- omics_data_host_new
reorder_idx <- match(sample_info$ID_New, colnames(omics_data_host))
omics_data_host <- omics_data_host[ , reorder_idx]
all(colnames(omics_data_host) == sample_info$ID_New)
## [1] TRUE
DESeq2 analysis
df <- round(omics_data_host) %>%
# The next steps require a data frame and round() returns a matrix
as.data.frame() %>%
# Only keep rows that have total counts above the cutoff
dplyr::filter(rowSums(.) >= 0)
#all(colnames(df) == rownames(sample_info))
dds <- DESeqDataSetFromMatrix(
countData = df, # Our prepped data frame with counts
colData = sample_info, # Data frame with annotation for our samples
design = ~1 # Here we are not specifying a model
)
dds$group <- factor(paste0(dds$Time, dds$New_Diet))
design(dds) <- ~ group-1
dds_trial_1 <- DESeq(dds, parallel = T)
resultsNames(dds_trial_1 )
annot_function <- read_tsv("/Users/shashankgupta/Desktop/ImprovAFish/Salmo_salar-GCA_905237065.2_gene_annotations.tsv")
annot_function <- annot_function[, c("gene_id", "v2.gene_id.NCBI","v2.gene_name.ensembl", "v2.product")]
vst <- vst(dds_trial_1)
pcaPlot <- DESeq2::plotPCA(vst, intgroup = c("Time", "New_Diet", "group"), ntop = nrow(dds_trial_1), returnData = TRUE)
percentVar <- round(100 * attr(pcaPlot, "percentVar"))
plot8<- ggplot(pcaPlot, aes(PC1, PC2, color=New_Diet)) +
geom_point(aes(fill ="New_Diet" ), size =2.5) +
geom_point(shape = 1, size = 2.5,colour = "black") +
theme_bw() + ggtitle("Hindgut: Host transcriptomics") +
theme(plot.title = element_text(hjust = 0.5)) +
xlab("PC 1 [12 %]") + ylab("PC 2 [6 %]") +
stat_ellipse() + scale_fill_manual(values =annotation_colors$New_Diet) +
scale_colour_manual( values = annotation_colors$New_Diet) +
guides(color = guide_legend(title = "Diet"))
plot8.1<- ggplot(pcaPlot, aes(PC1, PC2, color=Time)) +
geom_point(aes(fill ="Time" ), size =2.5) +
geom_point(shape = 1, size = 2.5,colour = "black") +
theme_bw() + ggtitle("Hindgut: Host transcriptomics") +
theme(plot.title = element_text(hjust = 0.5)) +
xlab("PC 1 [12 %]") + ylab("PC 2 [6 %]") +
stat_ellipse() + scale_fill_manual(values =annotation_colors$samplingTime) +
scale_colour_manual( values = annotation_colors$samplingTime) +
guides(color = guide_legend(title = "Life stage"))
# Perform PERMANOVA using adonis2
dist_matrix <- dist(pcaPlot[, c("PC1", "PC2")])
permanova_result <- adonis2(dist_matrix ~ Time, data = pcaPlot, permutations = 9999)
permanova_result
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 9999
##
## adonis2(formula = dist_matrix ~ Time, data = pcaPlot, permutations = 9999)
## Df SumOfSqs R2 F Pr(>F)
## Time 2 108426 0.36678 40.256 0.0001 ***
## Residual 139 187194 0.63322
## Total 141 295620 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Import data and intial preprocessing
# Specify the path to your RDS file on GitHub
rds_file <- "Metatranscriptomics/low_dosage_trial/dds_metatranscriptomics.rds"
rds_url <- paste0("https://raw.githubusercontent.com/shashank-KU/ImprovaFish-MDF-Effects/main/", rds_file)
# Read the RDS file
dds_metatranscriptomics <- readRDS(url(rds_url))
# Filter out T0 time point
dds_metatranscriptomics <- dds_metatranscriptomics[ , dds_metatranscriptomics$TimePoint != "T0"]
# Prepare data for analysis
dds_metatranscriptomics$New_Diet <- toupper(dds_metatranscriptomics$Treatment)
dds_metatranscriptomics$Time <- toupper(dds_metatranscriptomics$TimePoint)
dds_metatranscriptomics$group <- factor(paste0(dds_metatranscriptomics$Time, dds_metatranscriptomics$New_Diet))
design(dds_metatranscriptomics) <- ~ group-1
# Calculate gene sums and filter out low-expressed genes
gene_sums <- rowSums(counts(dds_metatranscriptomics, normalized = FALSE))
dds_filtered <- dds_metatranscriptomics[gene_sums >= 50,]
# Perform DESeq analysis
dds_metatranscriptomics <- DESeq(dds_filtered, parallel = TRUE)
# Explore results
resultsNames(dds_metatranscriptomics)
## [1] "groupT1CTR" "groupT1MC1" "groupT1MC2" "groupT1MN3" "groupT2CTR"
## [6] "groupT2MC1" "groupT2MC2" "groupT2MN3" "groupT3CTR" "groupT3MC1"
## [11] "groupT3MC2" "groupT3MN3"
res <- results(dds_metatranscriptomics)
# Count and print the number of differentially expressed genes with adjusted p-value < 0.05
cat("Number of differentially expressed microbial genes (padj < 0.05):", sum(res$padj < 0.05), "\n")
## Number of differentially expressed microbial genes (padj < 0.05): NA
##
## out of 82484 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 3575, 4.3%
## LFC < 0 (down) : 178, 0.22%
## outliers [1] : 0, 0%
## low counts [2] : 44779, 54%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Order results by adjusted p-value
res <- res[order(res$padj),]
# Filter and arrange results for display
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
filter(padj < 0.05) %>%
arrange(padj)
# Perform variance stabilizing transformation (VST)
vst_data <- vst(dds_metatranscriptomics)
# Generate PCA plot
pcaPlot <- DESeq2::plotPCA(vst_data, intgroup = c("Time", "New_Diet", "group"), ntop = nrow(dds_trial_1), returnData = TRUE)
# Extract percentage of variance explained by PCs
percentVar <- round(100 * attr(pcaPlot, "percentVar"))
# Create PCA plot using ggplot2
plot9 <- ggplot(pcaPlot, aes(PC1, PC2, color=New_Diet)) +
geom_point(aes(fill ="New_Diet" ), size =2.5) +
geom_point(shape = 1, size = 2.5,colour = "black") +
theme_bw() + ggtitle("Hindgut: Metatranscriptomics") +
theme(plot.title = element_text(hjust = 0.5)) +
xlab(paste("PC 1 [", percentVar[1], "%]")) + ylab(paste("PC 2 [", percentVar[2], "%]")) +
stat_ellipse() + scale_fill_manual(values = annotation_colors$New_Diet) +
scale_colour_manual( values = annotation_colors$New_Diet) +
guides(color = guide_legend(title = "Diet"))
contrasts <- c("T1MC1", "T1MC2", "T1MN3")
for (contrast in contrasts) {
contrast_group <- paste0("group", contrast)
contrast_levels <- list(contrast_group, "groupT1CTR")
res <- results(dds_metatranscriptomics, contrast = contrast_levels)
# Extract the number of differentially expressed genes with adjusted p-value < 0.05
num_de_genes <- sum(res$padj < 0.05, na.rm = TRUE)
# Print informative message
cat("Number of differentially expressed microbial genes at time point 1 (T1)", "for CTR vs", contrast_group, "is", num_de_genes, "\n")
}
## Number of differentially expressed microbial genes at time point 1 (T1) for CTR vs groupT1MC1 is 25
## Number of differentially expressed microbial genes at time point 1 (T1) for CTR vs groupT1MC2 is 24
## Number of differentially expressed microbial genes at time point 1 (T1) for CTR vs groupT1MN3 is 12
contrasts <- c("T2MC1", "T2MC2", "T2MN3")
for (contrast in contrasts) {
contrast_group <- paste0("group", contrast)
contrast_levels <- list(contrast_group, "groupT2CTR")
res <- results(dds_metatranscriptomics, contrast = contrast_levels)
# Extract the number of differentially expressed genes with adjusted p-value < 0.05
num_de_genes <- sum(res$padj < 0.05, na.rm = TRUE)
# Print informative message
cat("Number of differentially expressed microbial genes at time point 2 (T2)", contrast, "for CTR vs", contrast_group, "is", num_de_genes, "\n")
}
## Number of differentially expressed microbial genes at time point 2 (T2) T2MC1 for CTR vs groupT2MC1 is 7
## Number of differentially expressed microbial genes at time point 2 (T2) T2MC2 for CTR vs groupT2MC2 is 35
## Number of differentially expressed microbial genes at time point 2 (T2) T2MN3 for CTR vs groupT2MN3 is 33
contrasts <- c("T3MC1", "T3MC2", "T3MN3")
for (contrast in contrasts) {
contrast_group <- paste0("group", contrast)
contrast_levels <- list(contrast_group, "groupT3CTR")
res <- results(dds_metatranscriptomics, contrast = contrast_levels)
# Extract the number of differentially expressed genes with adjusted p-value < 0.05
num_de_genes <- sum(res$padj < 0.05, na.rm = TRUE)
# Print informative message
cat("Number of differentially expressed microbial genes at time point 3 (T3)", contrast, "for CTR vs", contrast_group, "is", num_de_genes, "\n")
}
## Number of differentially expressed microbial genes at time point 3 (T3) T3MC1 for CTR vs groupT3MC1 is 25
## Number of differentially expressed microbial genes at time point 3 (T3) T3MC2 for CTR vs groupT3MC2 is 21
## Number of differentially expressed microbial genes at time point 3 (T3) T3MN3 for CTR vs groupT3MN3 is 24
plot_left <- plot_grid(plot2, plot1, plot7$gtable, align = "hv", ncol = 1, rel_heights = c(.6,.6,1.5), labels = c('B', 'C', 'G'))
plot_right <- plot_grid(plot3, plot4, plot8, plot9, align = "hv", ncol = 1, rel_widths = c(1,1,1,1,1), labels = c('D', 'E', 'F', 'H' ))
plot_grid(plot_left, plot_right, align = "hv", rel_widths = c(2,1))