library("ranacapa")
library("phyloseq")
library("ggplot2")
library("stringr")
library("plyr")
library("reshape2")
library("reshape")
library("dplyr")
library("tidyr")
library("doBy")
library("plyr")
library("microbiome")
library("ggpubr")
library("vegan")
library("tidyverse")
library("magrittr")
library("cowplot")
library("dendextend")
library("WGCNA")
library("metagenomeSeq")
library("decontam")
library("RColorBrewer")
library("ampvis2")
library("ggpubr")
library("formatR")
library("DT")
setwd("/Users/shashankgupta/Desktop/ImprovAFish/github/ImprovaFish/Metagenomics")
raw <- import_biom("/Users/shashankgupta/Desktop/ImprovAFish/exported-feature-table/feature-table_taxonomy.biom")
tree <- read_tree("/Users/shashankgupta/Desktop/ImprovAFish/exported-feature-table/tree.nwk")
refseq <- Biostrings::readDNAStringSet("/Users/shashankgupta/Desktop/ImprovAFish/exported-feature-table/dna-sequences.fasta", use.names = TRUE)
dat <- read.table("/Users/shashankgupta/Desktop/ImprovAFish/metadata.txt", header = TRUE,row.names = 1, sep = "\t")
# Merge into one complete phyloseq object
all <- merge_phyloseq(raw, sample_data(dat), tree, refseq)
tax <- data.frame(tax_table(all), stringsAsFactors = FALSE)
tax <- tax[,1:7] # No info in col 8-15
# Set informative colnames
colnames(tax) <- c("Kingdom", "Phylum","Class","Order","Family","Genus", "Species")
tax.clean <- data.frame(row.names = row.names(tax),
Kingdom = str_replace(tax[,1], "d__",""),
Phylum = str_replace(tax[,2], "p__",""),
Class = str_replace(tax[,3], "c__",""),
Order = str_replace(tax[,4], "o__",""),
Family = str_replace(tax[,5], "f__",""),
Genus = str_replace(tax[,6], "g__",""),
Species = str_replace(tax[,7], "s__",""),
stringsAsFactors = FALSE)
tax.clean[is.na(tax.clean)] <- ""
# - Clean rank by rank
# Kingdom - Remove the unassigned completely
# Phylum
# Class
# Remove extra info about origin from some bacteria
# Remove all fields that contain "uncultured", "Unknown" or "Ambigious"
bad <- c("Ambiguous_taxa","uncultured", "Subgroup_21")
tax.clean[tax.clean$Class %in% bad,3:7] <- ""
# Order
bad <- c("0319-6G20","1-20","11-24", "ADurb.Bin180","D8A-2", "Group_1.1c", "JGI_0000069-P22","Marine_Group_II",
"Pla3_lineage","Run-SP154",
"Ambiguous_taxa", "uncultured", "UBA10353_marine_group",
"Subgroup_17", "SAR86_clade", "SAR11_clade", "SAR202_clade")
tax.clean[tax.clean$Order %in% bad,4:7] <- ""
# Family
bad <- c("Ambiguous_taxa","11-24","67-14", "uncultured", "SAR116_clade", "Run-SP154",
"Marine_Group_II", "env.OPS_17", "SAR116_clade", "S085", "S-70", "NS9_marine_group", "Mitochondria")
tax.clean[tax.clean$Family %in% bad,5:7] <- ""
# Genus
bad <- c("Ambiguous_taxa","Unknown_Family","uncultured","Subgroup_10", "1174-901-12", "67-14")
tax.clean[tax.clean$Genus %in% bad,6:7] <- ""
# Species
bad <- c("Ambiguous_taxa","marine_metagenome","low_GC","wastewater_metagenome","unidentified",
"uncultured_synthetic", "uncultured_organism")
tax.clean[tax.clean$Species %in% bad,6:7] <- ""
#tax.clean[grepl("uncultured", tax.clean$Species),"Species"] <- ""
#tax.clean[grepl("unidentified", tax.clean$Species),"Species"] <- ""
# Remove remove ".", change "-" and " " to "_"
for (i in 1:ncol(tax.clean)){
tax.clean[,i] <- str_replace_all(tax.clean[,i], "[.]","")
tax.clean[,i] <- str_replace_all(tax.clean[,i], "[(]","")
tax.clean[,i] <- str_replace_all(tax.clean[,i], "[)]","")
tax.clean[,i] <- str_replace_all(tax.clean[,i], "-","_")
tax.clean[,i] <- str_replace_all(tax.clean[,i], " ","_")
}
for (i in 1:7){ tax.clean[,i] <- as.character(tax.clean[,i])}
# File holes in the tax table
for (i in 1:nrow(tax.clean)){
# Fill in missing taxonomy
if (tax.clean[i,2] == ""){
kingdom <- paste("Kingdom_", tax.clean[i,1], sep = "")
tax.clean[i, 2:7] <- kingdom
} else if (tax.clean[i,3] == ""){
phylum <- paste("Phylum_", tax.clean[i,2], sep = "")
tax.clean[i, 3:7] <- phylum
} else if (tax.clean[i,4] == ""){
class <- paste("Class_", tax.clean[i,3], sep = "")
tax.clean[i, 4:7] <- class
} else if (tax.clean[i,5] == ""){
order <- paste("Order_", tax.clean[i,4], sep = "")
tax.clean[i, 5:7] <- order
} else if (tax.clean[i,6] == ""){
family <- paste("Family_", tax.clean[i,5], sep = "")
tax.clean[i, 6:7] <- family
} else if (tax.clean[i,7] == ""){
tax.clean$Species[i] <- paste("Genus_",tax.clean$Genus[i], sep = "_")
}
}
rm(bad, class, family, i, kingdom,new,order,phylum,uncul)
tax_table(all) <- as.matrix(tax.clean)
all
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 7521 taxa and 168 samples ]
## sample_data() Sample Data: [ 168 samples by 12 sample variables ]
## tax_table() Taxonomy Table: [ 7521 taxa by 7 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 7521 tips and 7465 internal nodes ]
## refseq() DNAStringSet: [ 7521 reference sequences ]
#Remove Unnassigned
#all.clean <- subset_taxa(all, Kingdom != "Archaea")
#all.clean <- subset_taxa(all.clean, Kingdom != "Eukaryota")
all.clean <- subset_taxa(all, Kingdom != "Unassigned")
all.clean <- subset_taxa(all.clean, Order != "Chloroplast")
all.clean <- prune_taxa(taxa_sums(all.clean) > 0, all.clean)
all.clean
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 7027 taxa and 168 samples ]
## sample_data() Sample Data: [ 168 samples by 12 sample variables ]
## tax_table() Taxonomy Table: [ 7027 taxa by 7 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 7027 tips and 6979 internal nodes ]
## refseq() DNAStringSet: [ 7027 reference sequences ]
p <- ggrare(all.clean, step = 1000,
color = "New_Diet",
label = "sampleType", se = F,
parallel = TRUE,
plot = FALSE)
cols <- c(brewer.pal(8,"Set1"), brewer.pal(7,"Dark2"),brewer.pal(7,"Set2"),brewer.pal(12,"Set3"),brewer.pal(7,"Accent"),brewer.pal(12,"Paired"),"gray")
p <- p + theme_bw() +
scale_fill_manual(values =cols) +
scale_colour_manual( values = cols) +
facet_wrap(~New_Diet)
p
shannon.div <- estimate_richness(all.clean, measures = c("Shannon", "Simpson", "Observed","Chao1"))
sampledata1<- data.frame(sample_data(all.clean))
row.names(shannon.div) <- gsub("[.]","-", row.names(shannon.div))
sampleData <- merge(sampledata1, shannon.div, by = 0 , all = TRUE)
sampleData$New_Diet <- factor(sampleData$New_Diet, levels=c('CTR','ext-ctrl', 'MC1', 'MC2', 'MN3'))
levels(sampleData$New_Diet)
## [1] "CTR" "ext-ctrl" "MC1" "MC2" "MN3"
my_comparisons <- list( c("ext-ctrl", "MC1"), c("ext-ctrl", "MC2"), c("ext-ctrl", "MN3"),
c("CTR", "MC1"), c("CTR", "MC2"),
c("CTR", "MN3"))
p1 <- ggboxplot(sampleData, x = "New_Diet", y = "Observed",
color = "New_Diet", palette = "jco", legend = "none")+
stat_compare_means(comparisons = my_comparisons)+ # Add pairwise comparisons p-value
stat_compare_means(label.y = 400) +
geom_jitter(aes(colour = New_Diet), size = 2, alpha = 0.6) +
geom_boxplot(aes(fill = New_Diet), width=0.7, alpha = 0.5) +
theme_bw() + theme(legend.position="none",axis.title.x=element_blank()) + scale_fill_manual(values =cols) + scale_colour_manual( values = cols)
## [1] FALSE
set.seed(1)
PCoA_bray <- ordinate(physeq = all.clean, method = "PCoA", distance = "bray")
PCoA_bray_plot<- plot_ordination(
physeq = all.clean,
ordination = PCoA_bray,
color = "New_Diet"
) +
geom_point(shape = 19, alpha=0.7) + theme_bw() + ggtitle("PCoA Plot - Bray") +
xlab("PCoA 1 [17.4 %]") + ylab("PCoA 2 [8.8 %]") + stat_ellipse()+ scale_fill_manual(values =cols) + scale_colour_manual( values = cols)
bottom_row <- plot_grid(p1, PCoA_bray_plot, labels = c('B', 'C'), align = 'h', rel_widths = c(1, 1.3))
plot_grid(p, bottom_row, labels = c('A', ''), ncol = 1, rel_heights = c(1, 1.2))
Computes the Bray-Curtis distance between all the samples in the dataset “all.clean” and tests for differences between samples in the variable “New_Diet” using the Adonis2 function. The Adonis2 function was run with 9999 permutations.
sampledf <- data.frame(sample_data(all.clean))
bcdist <- phyloseq::distance(all.clean, method="bray",normalized=TRUE)
adonis2(bcdist ~ New_Diet,
data = sampledf, permutations = 9999)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 9999
##
## adonis2(formula = bcdist ~ New_Diet, data = sampledf, permutations = 9999)
## Df SumOfSqs R2 F Pr(>F)
## New_Diet 4 3.390 0.0667 2.9122 0.0001 ***
## Residual 163 47.432 0.9333
## Total 167 50.822 1.0000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
df <- as.data.frame(sample_data(all.clean)) # Put sample_data into a ggplot-friendly data.frame
df$Sample_or_Control <- ifelse( df$New_Diet %in% c("ext-ctrl"), "Control_Sample", "True_Sample")
sample_data(all.clean) <- df
df$LibrarySize <- sample_sums(all.clean)
df <- df[order(df$LibrarySize),]
df$Index <- seq(nrow(df))
#ggplot(data=df, aes(x=Index, y=LibrarySize, color=Sample_or_Control)) + geom_point() + theme_bw()
# Identify Contaminants - Prevalence
sample_data(all.clean)$is.neg <- sample_data(all.clean)$Sample_or_Control == "Control_Sample"
#more aggressive classification threshold rather than the default. i.e., 0.05
contamdf.prev05 <- isContaminant(all.clean, method="prevalence", neg="is.neg", threshold=0.5)
paste("The number of contamination found is",table(contamdf.prev05$contaminant)[2])
## [1] "The number of contamination found is 91"
all.noncontam <- prune_taxa(!contamdf.prev05$contaminant, all.clean)
set.seed(1)
PCoA_bray <- ordinate(physeq = all.noncontam, method = "PCoA", distance = "bray")
PCoA_bray_plot<- plot_ordination(
physeq = all.noncontam,
ordination = PCoA_bray,
color = "New_Diet"
) +
geom_point(shape = 19, alpha=0.7) + theme_bw() + ggtitle("PCoA Plot - Bray") +
xlab("PCoA 1 [17.6 %]") + ylab("PCoA 2 [9 %]") + stat_ellipse()
psdata <- subset_samples(all.noncontam, Sample_or_Control=="True_Sample")
psdata <- prune_taxa(taxa_sums(psdata) > 0, psdata)
psdata
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 6615 taxa and 156 samples ]
## sample_data() Sample Data: [ 156 samples by 14 sample variables ]
## tax_table() Taxonomy Table: [ 6615 taxa by 7 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 6615 tips and 6573 internal nodes ]
## refseq() DNAStringSet: [ 6615 reference sequences ]
Inspect the number of reads per sample and compare to rarefaction curves
sample_data(psdata)$reads <- unlist(sample_sums(psdata))
dat1 <- data.frame(sample_data(psdata))
#table(dat1$reads)
# Set a cutoff where Shannon diversity dont increase, and observed increases markedly slower.
# At the same time don't throw too many samples out. Set the cutoff value accordingly
cutoff <- 0
# See which samples have been removed
#dat1[dat1$reads < cutoff,]
# Remove the samples with fewer reads than the cutoff
psdata.p <- prune_samples(sample_sums(psdata) > cutoff, psdata)
psdata.p <- subset_samples(psdata, samplingTime != "T0")
psdata.p <- prune_taxa(taxa_sums(psdata.p) >0, psdata.p)
psdata.p
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 6230 taxa and 144 samples ]
## sample_data() Sample Data: [ 144 samples by 15 sample variables ]
## tax_table() Taxonomy Table: [ 6230 taxa by 7 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 6230 tips and 6193 internal nodes ]
## refseq() DNAStringSet: [ 6230 reference sequences ]
psdata.r<- transform_sample_counts(psdata.p, function(x) x / sum(x) )
rm(all)
rm(all.clean)
rm(all.noncontam)
Remove Time point ‘T0’
psdata.r <- subset_samples(psdata.r, samplingTime != "T0")
psdata.r <- prune_taxa(taxa_sums(psdata.r) > 0, psdata.r)
Phylum level taxonomic distribution. Bars report the mean abundance for each individual sample.
#psdata.r<- transform_sample_counts(all.clean, function(x) x / sum(x) )
Final.RNA <- aggregate_rare(psdata.r, level = "Phylum", detection = 1/100, prevalence = 20/100)
getPalette = colorRampPalette(brewer.pal(10, "Dark2"))
PhylaPalette = getPalette(10)
Final.RNA_phylum_plot<- plot_composition(Final.RNA, sample.sort = "Proteobacteria",otu.sort = "abundance", verbose = TRUE)
Final.RNA_phylum_plot <- Final.RNA_phylum_plot +
theme_bw() +
theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
scale_fill_manual(values = PhylaPalette)
Final.RNA_phylum_plot
#Bacterial Community Composition for Manuscript
Final.seq.melt.RNA <- psmelt(tax_glom(psdata.r, "Species"))
tax_ranks <- c("Phylum", "Class", "Order", "Family", "Genus", "Species")
for (rank in tax_ranks) {
n_unique <- length(unique(Final.seq.melt.RNA[[rank]]))
message(paste(rank, ": ", n_unique, sep = ""))
}
## Phylum: 56
## Class: 123
## Order: 276
## Family: 468
## Genus: 1042
## Species: 1173
paste("number of unique Phylum is", table(grepl("Kingdom", unique(Final.seq.melt.RNA$Phylum)))[1])
## [1] "number of unique Phylum is 54"
paste("number of unique Class is", table(grepl("Kingdom|Phylum", unique(Final.seq.melt.RNA$Class)))[1])
## [1] "number of unique Class is 117"
paste("number of unique Order is", table(grepl("Kingdom|Class|Phylum", unique(Final.seq.melt.RNA$Order)))[1])
## [1] "number of unique Order is 252"
paste("number of unique Family is", table(grepl("Kingdom|Order|Class|Phylum", unique(Final.seq.melt.RNA$Family)))[1])
## [1] "number of unique Family is 406"
paste("number of unique Genus is", table(grepl("Kingdom|Family|Order|Class|Phylum", unique(Final.seq.melt.RNA$Genus)))[1])
## [1] "number of unique Genus is 842"
paste("number of unique Species is", table(grepl("Kingdom|Family|Order|Class|Phylum|Genus", unique(Final.seq.melt.RNA$Species)))[1])
## [1] "number of unique Species is 415"
Supplementary table for the manuscript
Phylum_df <- summaryBy(Abundance~Phylum, data=Final.seq.melt.RNA, FUN=sum)
Phylum_df$Percent <- round(Phylum_df$Abundance.sum/sum(Phylum_df$Abundance.sum)*100, 4)
Phylum_df <- plyr::arrange(Phylum_df, plyr::desc(Percent))
Phylum_df$PercentageRound <- round(Phylum_df$Percent, digits = 2)
DT::datatable(Phylum_df)
order_df <- summaryBy(Abundance~Order, data=Final.seq.melt.RNA, FUN=sum)
order_df$Percent <- round(order_df$Abundance.sum/sum(order_df$Abundance.sum)*100, 4)
order_df <- plyr::arrange(order_df, plyr::desc(Percent))
order_df$PercentageRound <- round(order_df$Percent, digits = 2)
DT::datatable(order_df)
genus_df <- summaryBy(Abundance~Genus, data=Final.seq.melt.RNA, FUN=sum)
genus_df$Percent <- round(genus_df$Abundance.sum/sum(genus_df$Abundance.sum)*100, 4)
genus_df <- plyr::arrange(genus_df, plyr::desc(Percent))
genus_df$Round <- round(genus_df$Percent, digits = 2)
DT::datatable(genus_df)
shannon.div <- estimate_richness(psdata.p, measures = c("Shannon", "Observed"))
sampledata1<- data.frame(sample_data(psdata.p))
row.names(shannon.div) <- gsub("[.]","-", row.names(shannon.div))
sampleData <- merge(sampledata1, shannon.div, by = 0 , all = TRUE)
sampleData$New_Diet <- factor(sampleData$New_Diet, levels=c( 'CTR', 'MC1', 'MC2', 'MN3'))
sampleData$samplingTime <- factor(sampleData$samplingTime, levels=c('T1', 'T2', 'T3'))
my_comparisons <- list( c("CTR", "MC1"), c("CTR", "MC2"), c("CTR", "MN3"),
c("MC1", "MC2"), c("MC1", "MN3"), c("MC2", "MN3") )
p2 <- ggboxplot(sampleData, x = "New_Diet", y = "Shannon",
color = "New_Diet", palette = "jco", legend = "none") +
stat_compare_means(comparisons = my_comparisons) +
stat_compare_means(label.y = 6.5) +
geom_jitter(aes(colour = New_Diet), size = 2, alpha = 0.6) +
geom_boxplot(aes(fill = New_Diet), width=0.7, alpha = 0.5) +
theme_bw() + theme(legend.position="none",axis.title.x=element_blank()) +
scale_fill_manual(values = cols) +
scale_colour_manual( values = cols) +
facet_wrap("samplingTime")
## [1] FALSE
my_comparisons <- list(c("T1", "T2"), c("T1", "T3"), c("T2", "T3") )
p3 <- ggboxplot(sampleData, x = "samplingTime", y = "Shannon",
color = "samplingTime", palette = "jco", legend = "none")+
stat_compare_means(comparisons = my_comparisons)+ # Add pairwise comparisons p-value
stat_compare_means(label.y = 5.5) +
geom_jitter(aes(colour = samplingTime), size = 2, alpha = 0.6) +
geom_boxplot(aes(fill = samplingTime), width=0.7, alpha = 0.5) +
theme_bw() +
theme(legend.position="none",axis.title.x=element_blank()) +
scale_fill_manual(values = cols) +
scale_colour_manual( values = cols) +
facet_wrap(. ~ New_Diet, ncol = 4)
## [1] FALSE
# plot_grid(p2, p3, labels = c('A', 'B'), label_size = 12, ncol = 2)
set.seed(1)
PCoA_bray <- ordinate(physeq = psdata.p, method = "PCoA", distance = "bray")
p4 <- plot_ordination(
physeq = psdata.p,
ordination = PCoA_bray,
color = "samplingTime"
) +
geom_point(shape = 19, alpha=0.7) +
theme_bw() + ggtitle("PCoA Plot - Bray") +
xlab("PCoA 1 [22.8 %]") + ylab("PCoA 2 [7.2 %]") +
stat_ellipse() + scale_fill_manual(values =cols) +
scale_colour_manual( values = cols)
# Run adonis test
sampledf <- data.frame(sample_data(psdata.p))
bcdist <- phyloseq::distance(psdata.p, method="bray",normalized=TRUE)
result <- adonis2(bcdist ~ samplingTime, data = sampledf, permutations = 9999)
p4 <- p4 + annotate(
"text", x = -0.45, y = 0.28,
label = paste("Adonis R2 =", round(result$R2[1], 3),
"\np-value =", result$`Pr(>F)`[1]),
col = "black", fontface = "bold")
set.seed(1)
p5 <- plot_ordination(
physeq = psdata.p,
ordination = PCoA_bray,
color = "New_Diet"
) +
geom_point(shape = 19, alpha=0.7) +
theme_bw() + ggtitle("PCoA Plot - Bray") +
xlab("PCoA 1 [22.8 %]") + ylab("PCoA 2 [7.2 %]") +
stat_ellipse() + scale_fill_manual(values =cols) +
scale_colour_manual( values = cols)
# Run adonis test
sampledf <- data.frame(sample_data(psdata.p))
result <- adonis2(bcdist ~ New_Diet, data = sampledf, permutations = 9999)
p5 <- p5 + annotate(
"text", x = -0.47, y = 0.3,
label = paste("Adonis R2 =", round(result$R2[1], 3),
"\np-value =", result$`Pr(>F)`[1]),
col = "black", fontface = "bold")
p6 <- plot_grid(p2, p3, labels = c('A', 'B'), label_size = 12, ncol = 1, align = "hv")
p7 <- plot_grid(p4, p5, labels = c('C', 'D'), label_size = 12, ncol = 1, align = "hv")
plot_grid(p6, p7, ncol = 2, rel_widths = c(1.5, 1))
Heatmap to summarize taxonomy at different time points
ampvis2_obj <- phyloseq_to_ampvis2(psdata.p)
amp_heatmap(ampvis2_obj,
group_by = "New_Diet",
facet_by = "samplingTime",
tax_aggregate = "Genus",
tax_add = c("Kingdom", "Phylum"),
tax_show = 50,
color_vector = c("white", "dark red"),
plot_colorscale = "sqrt",
plot_values = T) +
theme(axis.text.x = element_text(angle = 45, size=10, vjust = 1),
axis.text.y = element_text(size=8),
legend.position="right")
rabuplot(phylo_ob = psdata.r, predictor= "New_Diet", type = "Genus", facet_wrap ="samplingTime", N_taxa = 20)
Set seed as a precaution for reproducibility as some methods are non-deterministic.
set.seed(13118)
prefix <- "m"
save_TOM <- TRUE
pam_stage <- FALSE # Partitioning around medoids, tries to put more OTUs into modules where it is not directly clear from the dendrogram cutting.
set_verbose = 1 # How much detail from the WGCNA functions? Higher means more detail.
omics_type = "otu"
plot_labeling_size = 15
take_average <- TRUE
parameter_sets <- list(set_1 = list(applied_norm = "TSS", applied_transf = "CLR", assoc_measure = "bicor"),
set_2 = list(applied_norm = "CSS", applied_transf = "log2", assoc_measure = "bicor"))
chosen_parameter_set <- parameter_sets$set_2
psdata.r1 <- filter_taxa(psdata.p, function(x) sum(x > 4) > (0.01*length(x)), TRUE)# Remove taxa not seen more than 3 times in at least 5% of samples
psdata.r1 <- transform_sample_counts(psdata.r1, function(x) x / sum(x) )
#host_gut_mapping_file <- read.table("../../../host_data/host_gut_mapping.txt", sep = "\t", row.names = 1, header = T)
# host_gut_rawCounts_total1 <- read.table("../host_data/host_gut_count.txt")
#host_gut_rawCounts_total1 <- omics_data_host
gut_16SrRNA_mapping_file <- sample_data(psdata.r1)
#gut_16SrRNA_mapping_file$New_Diet <- tolower(gut_16SrRNA_mapping_file$New_Diet)
gut_16SrRNA_mapping_file$ID_New <- paste(gut_16SrRNA_mapping_file$samplingTime, "_",
gut_16SrRNA_mapping_file$New_Diet, "_",
gut_16SrRNA_mapping_file$samplingTank, "_", "0",
gut_16SrRNA_mapping_file$sampleNumber,
sep = "")
sample_info <- read.csv("/Users/shashankgupta/Desktop/ImprovAFish/ImprovAFish/sample_info.csv", row.names = 1)
host_gut_mapping_file <- sample_info
host_gut_mapping_file <- host_gut_mapping_file[host_gut_mapping_file$ID_New %in% gut_16SrRNA_mapping_file$ID_New, ]
gut_16SrRNA_mapping_file <- gut_16SrRNA_mapping_file[gut_16SrRNA_mapping_file$ID_New %in% host_gut_mapping_file$ID_New, ]
gut_16SrRNA_mapping_file$rownames <- row.names(gut_16SrRNA_mapping_file)
row.names(gut_16SrRNA_mapping_file) <- gut_16SrRNA_mapping_file$ID_New
omics_data <- otu_table(psdata.r1)
ID_New_name_list <- gut_16SrRNA_mapping_file$rownames
gut_16SrRNA_OTU_table <- omics_data[, colnames(omics_data) %in% ID_New_name_list]
table(colnames(gut_16SrRNA_OTU_table) == gut_16SrRNA_mapping_file$rownames)
##
## TRUE
## 142
colnames(gut_16SrRNA_OTU_table) <- gut_16SrRNA_mapping_file$ID_New
table(colnames(gut_16SrRNA_OTU_table) == gut_16SrRNA_mapping_file$ID_New)
##
## TRUE
## 142
omics_data <- gut_16SrRNA_OTU_table
omics_data <- t(omics_data)
dim(omics_data) %>% paste(c("Samples", "OTUs"))
HellingerData<-decostand(omics_data,method = "hellinger")
omics_data <- HellingerData
dim(omics_data) %>% paste(c("Samples", "OTUs"))
powers <- c(1:10, seq(12,20,2))
suppressWarnings(sft <- pickSoftThreshold(omics_data,
powerVector = powers,
verbose = set_verbose,
networkType = "signed",
corFn= chosen_parameter_set$assoc_measure))
# Find the soft thresholding power beta to which co-expression similarity is raised to calculate adjacency.
# based on the criterion of approximate scale-free topology.
idx <- min(which((-sign(sft$fitIndices[,3])*sft$fitIndices[,2]) > 0.90))
if(is.infinite(idx)){
idx <- min(which((-sign(sft$fitIndices[,3])*sft$fitIndices[,2]) > 0.80))
if(!is.infinite(idx)){
st <- sft$fitIndices[idx,1]
} else{
idx <- which.max(-sign(sft$fitIndices[,3])*sft$fitIndices[,2])
st <- sft$fitIndices[idx,1]
}
} else{
st <- sft$fitIndices[idx,1]
}
# Plot Scale independence measure and Mean connectivity measure
# Scale-free topology fit index as a function of the soft-thresholding power
data.frame(Indices = sft$fitIndices[,1],
sfApprox = -sign(sft$fitIndices[,3])*sft$fitIndices[,2]) %>%
ggplot() +
geom_hline(yintercept = 0.9, color = "black", alpha = 0.6) + # corresponds to R^2 cut-off of 0.9
geom_hline(yintercept = 0.8, color = "black", alpha = 0.2) + # corresponds to R^2 cut-off of 0.8
geom_line(aes(x = Indices, y = sfApprox), color = "red", alpha = 0.1, size = 2.5) +
geom_text(mapping = aes(x = Indices, y = sfApprox, label = Indices), color = "red", size = 4) +
ggtitle("Scale independence") +
xlab("Soft Threshold (power)") +
ylab("SF Model Fit,signed R^2") + theme_bw() +
xlim(1,20) +
ylim(-1,1) +
geom_segment(aes(x = st, y = 0.25, xend = st, yend = sfApprox[idx]-0.05),
arrow = arrow(length = unit(0.2,"cm")),
size = 0.5)-> scale_independence_plot
# Mean connectivity as a function of the soft-thresholding power
data.frame(Indices = sft$fitIndices[,1],
meanApprox = sft$fitIndices[,5]) %>%
ggplot() +
geom_line(aes(x = Indices, y = meanApprox), color = "red", alpha = 0.1, size = 2.5) +
geom_text(mapping = aes(x = Indices, y = meanApprox, label = Indices), color = "red", size = 4) +
xlab("Soft Threshold (power)") +
ylab("Mean Connectivity") + theme_bw() +
geom_segment(aes(x = st-0.4,
y = sft$fitIndices$mean.k.[idx],
xend = 0,
yend = sft$fitIndices$mean.k.[idx]),
arrow = arrow(length = unit(0.2,"cm")),
size = 0.4) +
ggtitle(paste0("Mean connectivity: ",
round(sft$fitIndices$mean.k.[idx],2))) -> mean_connectivity_plot
cowplot::plot_grid(scale_independence_plot, mean_connectivity_plot, ncol = 2, align = "h", labels = c("A", "B"), label_size = plot_labeling_size) -> si_mc_plot
si_mc_plot
allowWGCNAThreads()
modules.omics.Y <- blockwiseModules(omics_data,
power = st,
networkType = "signed",
TOMType = "signed",
corType = chosen_parameter_set$assoc_measure,
#maxPOutliers = 0.05,
#deepSplit = 4, # Default 2
minModuleSize = 12, # 30
#minCoreKME = 0.5, # Default 0.5
#minCoreKMESize = 2, # Default minModuleSize/3,
#minKMEtoStay = 0.5, # Default 0.3
#reassignThreshold = 0, # Default 1e-6
#mergeCutHeight = 0.4, # Default 0.15
#pamStage = pam_stage,
#pamRespectsDendro = TRUE,
#replaceMissingAdjacencies = TRUE,
numericLabels = TRUE,
saveTOMs = save_TOM,
saveTOMFileBase = "TOM",
verbose = 3,
maxBlockSize=8000,
nThreads = 10)
moduleLabels <- modules.omics.Y$colors
moduleColors <- labels2colors(modules.omics.Y$colors)
hubs <- chooseTopHubInEachModule(omics_data, modules.omics.Y$colors, power = st, omitColors = "0")
stage2results_Y <- list(modules = modules.omics.Y,
hubs = hubs)
saveRDS(stage2results_Y, "/Users/shashankgupta/Desktop/ImprovAFish/github/ImprovaFish/OmicsIntegration/stage2results_Metagenomics.rds")
Convert labels to colors for plotting and check how many ASVs are there in each modules
merged_colors <- labels2colors(stage2results_Y$modules$colors)
n_modules <- unique(merged_colors) %>% length()
samples_good <- sum(stage2results_Y$modules$goodSamples) == length(stage2results_Y$modules$goodSamples)
genes_good <- sum(stage2results_Y$modules$goodGenes) == length(stage2results_Y$modules$goodGenes)
ME_good <- sum(stage2results_Y$modules$MEsOK) == length(stage2results_Y$modules$MEsOK)
table(stage2results_Y$modules$colors) %>%
as.data.frame() %>%
dplyr::rename(Module = Var1, Size = Freq) %>%
dplyr::mutate(Module_color = labels2colors(as.numeric(as.character(Module)))) -> module_size
module_size %>%
ggplot(aes(x = Module, y = Size, fill = Module)) +
geom_col(color = "#000000") +
ggtitle("Number of genes in each module") +
theme(legend.position = "none") + theme_bw() +
scale_fill_manual(values = setNames(module_size$Module_color,module_size$Module)) +
geom_text(aes(label = Size),vjust = 0.5, hjust = -0.18, size = 3.5) +
ylim(0, max(module_size$Size)*1.1) +
theme(plot.margin = margin(2, 2, 2, 2, "pt")) +
coord_flip()-> module_size_barplot
Module correlation to other modules
all(rownames(omics_data) == rownames(MEs))
dim(omics_data) %>% paste0(c(" samples", " OTUs"))
kME <- bicor(omics_data, MEs, maxPOutliers = 0.05)
dim(kME) %>% paste0(c(" OTUs", " modules"))
intra_cor <- c()
for (i in 1:ncol(omics_data)) {
m <- stage2results_Y$modules$colors[i]
intra_cor[i] <- kME[i, paste0("ME", m)]
if(m != 0){
intra_cor[i] <- kME[i, paste0("ME", m)]
} else{
intra_cor[i] <- NA
}
}
idx <- which(is.na(intra_cor))
intra_cor <- intra_cor[-idx]
# Corr within modules
corr_within_module <- function(omics_data, modules, module_x = 1){
idx.omics_data <- which(modules$colors == module_x)
idx.me <- which(colnames(modules$MEs) == paste0("ME",module_x))
kME_x <- bicor(omics_data[,idx.omics_data], modules$MEs[,idx.me], maxPOutliers = 0.05)
kME_x
}
ggplot.list <- list()
for(m in colnames(stage2results_Y$modules$MEs))
{
h <- as.numeric(sub("ME","", m))
data.frame(x = suppressWarnings(corr_within_module(omics_data = omics_data, modules = stage2results_Y$modules, module_x = h))) %>%
ggplot() +
#geom_density(aes(x = x), fill = labels2colors(h), color = "black", alpha = 0.5) +
geom_histogram(aes(x), fill = labels2colors(h), color = "black", alpha = 0.5, bins = 20) +
xlim(-1, 1) +
xlab("ASV correlation")+ theme_bw() +
ggtitle(paste0(prefix,m)) -> da_plot
ggplot.list[[m]] <- da_plot
}
ggplot.list <- ggplot.list[ggplot.list %>% names() %>% sub("ME", "", .) %>% as.numeric() %>% order()]
cowplot::plot_grid(plotlist = ggplot.list, ncol = 6) -> density_all_plot
# comine to one plot
cowplot::plot_grid(si_mc_plot , density_eigen, ncol = 1, rel_heights = c(0.8,1)) -> part_1
cowplot::plot_grid(part_1, module_size_barplot, labels = c("", "C"), label_size = plot_labeling_size, rel_widths = c(1,0.5)) -> part_2
cowplot::plot_grid(part_2, density_all_plot, ncol = 1, rel_heights = c(0.8,1), labels = c("", "F"), label_size = plot_labeling_size)