Step 1. Metagenomics

Load all the packages used in the analysis

library("ranacapa")
library("phyloseq")
library("ggplot2")
library("stringr")
library("plyr")
library("reshape2")
library("reshape")
library("dplyr")
library("tidyr")
library("doBy")
library("plyr")
library("microbiome")
library("ggpubr")
library("vegan")
library("tidyverse")
library("magrittr")
library("cowplot")
library("dendextend")
library("WGCNA")
library("metagenomeSeq")
library("decontam")
library("RColorBrewer")
library("ampvis2")
library("ggpubr")
library("formatR")
library("DT")

Import data and clean the taxonomy

setwd("/Users/shashankgupta/Desktop/ImprovAFish/github/ImprovaFish/Metagenomics")
raw <- import_biom("/Users/shashankgupta/Desktop/ImprovAFish/exported-feature-table/feature-table_taxonomy.biom")
tree <- read_tree("/Users/shashankgupta/Desktop/ImprovAFish/exported-feature-table/tree.nwk")
refseq <- Biostrings::readDNAStringSet("/Users/shashankgupta/Desktop/ImprovAFish/exported-feature-table/dna-sequences.fasta", use.names = TRUE)
dat <- read.table("/Users/shashankgupta/Desktop/ImprovAFish/metadata.txt", header = TRUE,row.names = 1, sep = "\t")
# Merge into one complete phyloseq object
all <- merge_phyloseq(raw, sample_data(dat), tree, refseq)
tax <- data.frame(tax_table(all), stringsAsFactors = FALSE)
tax <- tax[,1:7] # No info in col 8-15
# Set informative colnames
colnames(tax) <- c("Kingdom", "Phylum","Class","Order","Family","Genus", "Species")
tax.clean <- data.frame(row.names = row.names(tax),
                        Kingdom = str_replace(tax[,1], "d__",""), 
                        Phylum = str_replace(tax[,2], "p__",""),
                        Class = str_replace(tax[,3], "c__",""),
                        Order = str_replace(tax[,4], "o__",""),
                        Family = str_replace(tax[,5], "f__",""),
                        Genus = str_replace(tax[,6], "g__",""),
                        Species = str_replace(tax[,7], "s__",""), 
                        stringsAsFactors = FALSE)
tax.clean[is.na(tax.clean)] <- ""
# - Clean rank by rank
# Kingdom - Remove the unassigned completely
# Phylum
# Class
# Remove extra info about origin from some bacteria
# Remove all fields that contain "uncultured", "Unknown" or "Ambigious"
bad <- c("Ambiguous_taxa","uncultured", "Subgroup_21")
tax.clean[tax.clean$Class %in% bad,3:7] <- ""

# Order
bad <- c("0319-6G20","1-20","11-24", "ADurb.Bin180","D8A-2", "Group_1.1c", "JGI_0000069-P22","Marine_Group_II",
         "Pla3_lineage","Run-SP154",
         "Ambiguous_taxa", "uncultured", "UBA10353_marine_group",
         "Subgroup_17", "SAR86_clade", "SAR11_clade", "SAR202_clade")
tax.clean[tax.clean$Order %in% bad,4:7] <- ""

# Family
bad <- c("Ambiguous_taxa","11-24","67-14", "uncultured", "SAR116_clade", "Run-SP154", 
         "Marine_Group_II", "env.OPS_17", "SAR116_clade", "S085", "S-70", "NS9_marine_group", "Mitochondria")
tax.clean[tax.clean$Family %in% bad,5:7] <- ""

# Genus
bad <- c("Ambiguous_taxa","Unknown_Family","uncultured","Subgroup_10", "1174-901-12", "67-14")
tax.clean[tax.clean$Genus %in% bad,6:7] <- ""

# Species
bad <- c("Ambiguous_taxa","marine_metagenome","low_GC","wastewater_metagenome","unidentified", 
         "uncultured_synthetic", "uncultured_organism")
tax.clean[tax.clean$Species %in% bad,6:7] <- ""

#tax.clean[grepl("uncultured", tax.clean$Species),"Species"] <- ""
#tax.clean[grepl("unidentified", tax.clean$Species),"Species"] <- ""

# Remove remove ".", change "-" and " " to "_"
for (i in 1:ncol(tax.clean)){
  tax.clean[,i] <- str_replace_all(tax.clean[,i], "[.]","")
  tax.clean[,i] <- str_replace_all(tax.clean[,i], "[(]","")
  tax.clean[,i] <- str_replace_all(tax.clean[,i], "[)]","")
  tax.clean[,i] <- str_replace_all(tax.clean[,i], "-","_")
  tax.clean[,i] <- str_replace_all(tax.clean[,i], " ","_")
}

for (i in 1:7){ tax.clean[,i] <- as.character(tax.clean[,i])}
# File holes in the tax table
for (i in 1:nrow(tax.clean)){
  #  Fill in missing taxonomy
  if (tax.clean[i,2] == ""){
    kingdom <- paste("Kingdom_", tax.clean[i,1], sep = "")
    tax.clean[i, 2:7] <- kingdom
  } else if (tax.clean[i,3] == ""){
    phylum <- paste("Phylum_", tax.clean[i,2], sep = "")
    tax.clean[i, 3:7] <- phylum
  } else if (tax.clean[i,4] == ""){
    class <- paste("Class_", tax.clean[i,3], sep = "")
    tax.clean[i, 4:7] <- class
  } else if (tax.clean[i,5] == ""){
    order <- paste("Order_", tax.clean[i,4], sep = "")
    tax.clean[i, 5:7] <- order
  } else if (tax.clean[i,6] == ""){
    family <- paste("Family_", tax.clean[i,5], sep = "")
    tax.clean[i, 6:7] <- family
  } else if (tax.clean[i,7] == ""){
    tax.clean$Species[i] <- paste("Genus_",tax.clean$Genus[i], sep = "_")
  }
}

rm(bad, class, family, i, kingdom,new,order,phylum,uncul)

tax_table(all) <- as.matrix(tax.clean)
all
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 7521 taxa and 168 samples ]
## sample_data() Sample Data:       [ 168 samples by 12 sample variables ]
## tax_table()   Taxonomy Table:    [ 7521 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 7521 tips and 7465 internal nodes ]
## refseq()      DNAStringSet:      [ 7521 reference sequences ]
#Remove Unnassigned
#all.clean <- subset_taxa(all, Kingdom != "Archaea")
#all.clean <- subset_taxa(all.clean, Kingdom != "Eukaryota")
all.clean <- subset_taxa(all, Kingdom != "Unassigned")
all.clean <- subset_taxa(all.clean, Order != "Chloroplast")

all.clean <- prune_taxa(taxa_sums(all.clean) > 0, all.clean)
all.clean
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 7027 taxa and 168 samples ]
## sample_data() Sample Data:       [ 168 samples by 12 sample variables ]
## tax_table()   Taxonomy Table:    [ 7027 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 7027 tips and 6979 internal nodes ]
## refseq()      DNAStringSet:      [ 7027 reference sequences ]

Rarefaction plot

p <- ggrare(all.clean, step = 1000, 
            color = "New_Diet", 
            label = "sampleType", se = F,
            parallel = TRUE,
            plot = FALSE)
cols  <- c(brewer.pal(8,"Set1"), brewer.pal(7,"Dark2"),brewer.pal(7,"Set2"),brewer.pal(12,"Set3"),brewer.pal(7,"Accent"),brewer.pal(12,"Paired"),"gray")

p <- p + theme_bw() + 
  scale_fill_manual(values =cols) +
  scale_colour_manual( values = cols) +
  facet_wrap(~New_Diet)
p

Alpha and Beta diversity

shannon.div <- estimate_richness(all.clean, measures = c("Shannon", "Simpson", "Observed","Chao1"))
sampledata1<- data.frame(sample_data(all.clean))
row.names(shannon.div) <- gsub("[.]","-", row.names(shannon.div))
sampleData <- merge(sampledata1, shannon.div, by = 0 , all = TRUE)
sampleData$New_Diet <- factor(sampleData$New_Diet, levels=c('CTR','ext-ctrl', 'MC1', 'MC2', 'MN3'))
levels(sampleData$New_Diet)
## [1] "CTR"      "ext-ctrl" "MC1"      "MC2"      "MN3"
my_comparisons <- list( c("ext-ctrl", "MC1"), c("ext-ctrl", "MC2"), c("ext-ctrl", "MN3"),
                        c("CTR", "MC1"), c("CTR", "MC2"),
                        c("CTR", "MN3"))
p1 <- ggboxplot(sampleData, x = "New_Diet", y = "Observed",
                color = "New_Diet", palette = "jco", legend = "none")+ 
  stat_compare_means(comparisons = my_comparisons)+ # Add pairwise comparisons p-value
  stat_compare_means(label.y = 400) +
  geom_jitter(aes(colour = New_Diet), size = 2, alpha = 0.6) +
  geom_boxplot(aes(fill = New_Diet), width=0.7, alpha = 0.5) +
  theme_bw() +  theme(legend.position="none",axis.title.x=element_blank())    +   scale_fill_manual(values =cols) + scale_colour_manual( values = cols)
## [1] FALSE
set.seed(1)
PCoA_bray <- ordinate(physeq = all.clean, method = "PCoA", distance = "bray")
PCoA_bray_plot<- plot_ordination(
  physeq = all.clean, 
  ordination = PCoA_bray, 
  color = "New_Diet"
) + 
  geom_point(shape = 19, alpha=0.7) + theme_bw() + ggtitle("PCoA Plot - Bray") + 
  xlab("PCoA 1 [17.4 %]") + ylab("PCoA 2 [8.8 %]") + stat_ellipse()+   scale_fill_manual(values =cols) + scale_colour_manual( values = cols)


bottom_row <- plot_grid(p1, PCoA_bray_plot, labels = c('B', 'C'), align = 'h', rel_widths = c(1, 1.3))
plot_grid(p, bottom_row, labels = c('A', ''), ncol = 1, rel_heights = c(1, 1.2))

Computes the Bray-Curtis distance between all the samples in the dataset “all.clean” and tests for differences between samples in the variable “New_Diet” using the Adonis2 function. The Adonis2 function was run with 9999 permutations.

sampledf <- data.frame(sample_data(all.clean))
bcdist <- phyloseq::distance(all.clean, method="bray",normalized=TRUE) 
adonis2(bcdist ~ New_Diet, 
        data = sampledf, permutations = 9999)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 9999
## 
## adonis2(formula = bcdist ~ New_Diet, data = sampledf, permutations = 9999)
##           Df SumOfSqs     R2      F Pr(>F)    
## New_Diet   4    3.390 0.0667 2.9122 0.0001 ***
## Residual 163   47.432 0.9333                  
## Total    167   50.822 1.0000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Contamination removal

df <- as.data.frame(sample_data(all.clean)) # Put sample_data into a ggplot-friendly data.frame
df$Sample_or_Control <- ifelse( df$New_Diet  %in% c("ext-ctrl"), "Control_Sample", "True_Sample")
sample_data(all.clean) <- df
df$LibrarySize <- sample_sums(all.clean)
df <- df[order(df$LibrarySize),]
df$Index <- seq(nrow(df))
#ggplot(data=df, aes(x=Index, y=LibrarySize, color=Sample_or_Control)) + geom_point() + theme_bw()
# Identify Contaminants - Prevalence
sample_data(all.clean)$is.neg <- sample_data(all.clean)$Sample_or_Control == "Control_Sample"
#more aggressive classification threshold rather than the default. i.e., 0.05
contamdf.prev05 <- isContaminant(all.clean, method="prevalence", neg="is.neg", threshold=0.5)
paste("The number of contamination found is",table(contamdf.prev05$contaminant)[2])
## [1] "The number of contamination found is 91"
all.noncontam <- prune_taxa(!contamdf.prev05$contaminant, all.clean)
set.seed(1)
PCoA_bray <- ordinate(physeq = all.noncontam, method = "PCoA", distance = "bray")
PCoA_bray_plot<- plot_ordination(
  physeq = all.noncontam, 
  ordination = PCoA_bray, 
  color = "New_Diet"
) + 
  geom_point(shape = 19, alpha=0.7) + theme_bw() + ggtitle("PCoA Plot - Bray") + 
  xlab("PCoA 1 [17.6 %]") + ylab("PCoA 2 [9 %]") + stat_ellipse()

psdata <- subset_samples(all.noncontam, Sample_or_Control=="True_Sample")
psdata <- prune_taxa(taxa_sums(psdata) > 0, psdata)
psdata
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 6615 taxa and 156 samples ]
## sample_data() Sample Data:       [ 156 samples by 14 sample variables ]
## tax_table()   Taxonomy Table:    [ 6615 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 6615 tips and 6573 internal nodes ]
## refseq()      DNAStringSet:      [ 6615 reference sequences ]

Inspect the number of reads per sample and compare to rarefaction curves

sample_data(psdata)$reads <- unlist(sample_sums(psdata))
dat1 <- data.frame(sample_data(psdata))
#table(dat1$reads)

# Set a cutoff where Shannon diversity dont increase, and observed increases markedly slower.
# At the same time don't throw too many samples out. Set the cutoff value accordingly
cutoff <- 0

# See which samples have been removed
#dat1[dat1$reads < cutoff,]

# Remove the samples with fewer reads than the cutoff
psdata.p <- prune_samples(sample_sums(psdata) > cutoff, psdata)
psdata.p <- subset_samples(psdata, samplingTime != "T0")
psdata.p <- prune_taxa(taxa_sums(psdata.p) >0, psdata.p)
psdata.p
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 6230 taxa and 144 samples ]
## sample_data() Sample Data:       [ 144 samples by 15 sample variables ]
## tax_table()   Taxonomy Table:    [ 6230 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 6230 tips and 6193 internal nodes ]
## refseq()      DNAStringSet:      [ 6230 reference sequences ]
psdata.r<- transform_sample_counts(psdata.p, function(x) x / sum(x) )
rm(all)
rm(all.clean)
rm(all.noncontam)

Remove Time point ‘T0’

psdata.r <- subset_samples(psdata.r, samplingTime != "T0")
psdata.r <- prune_taxa(taxa_sums(psdata.r) > 0, psdata.r)

Taxonomic classification

Phylum level taxonomic distribution. Bars report the mean abundance for each individual sample.

#psdata.r<- transform_sample_counts(all.clean, function(x) x / sum(x) )
Final.RNA <- aggregate_rare(psdata.r, level = "Phylum", detection = 1/100, prevalence = 20/100)
getPalette = colorRampPalette(brewer.pal(10, "Dark2")) 
PhylaPalette = getPalette(10)

Final.RNA_phylum_plot<- plot_composition(Final.RNA, sample.sort = "Proteobacteria",otu.sort = "abundance", verbose = TRUE)
Final.RNA_phylum_plot <- Final.RNA_phylum_plot + 
  theme_bw() + 
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
  scale_fill_manual(values = PhylaPalette)

Final.RNA_phylum_plot

#Bacterial Community Composition for Manuscript
Final.seq.melt.RNA <- psmelt(tax_glom(psdata.r, "Species"))
tax_ranks <- c("Phylum", "Class", "Order", "Family", "Genus", "Species")

for (rank in tax_ranks) {
  n_unique <- length(unique(Final.seq.melt.RNA[[rank]]))
  message(paste(rank, ": ", n_unique, sep = ""))
}
## Phylum: 56
## Class: 123
## Order: 276
## Family: 468
## Genus: 1042
## Species: 1173
paste("number of unique Phylum is", table(grepl("Kingdom", unique(Final.seq.melt.RNA$Phylum)))[1])
## [1] "number of unique Phylum is 54"
paste("number of unique Class is", table(grepl("Kingdom|Phylum", unique(Final.seq.melt.RNA$Class)))[1])
## [1] "number of unique Class is 117"
paste("number of unique Order is", table(grepl("Kingdom|Class|Phylum", unique(Final.seq.melt.RNA$Order)))[1])
## [1] "number of unique Order is 252"
paste("number of unique Family is", table(grepl("Kingdom|Order|Class|Phylum", unique(Final.seq.melt.RNA$Family)))[1])
## [1] "number of unique Family is 406"
paste("number of unique Genus is", table(grepl("Kingdom|Family|Order|Class|Phylum", unique(Final.seq.melt.RNA$Genus)))[1])
## [1] "number of unique Genus is 842"
paste("number of unique Species is", table(grepl("Kingdom|Family|Order|Class|Phylum|Genus", unique(Final.seq.melt.RNA$Species)))[1])
## [1] "number of unique Species is 415"

Supplementary table for the manuscript

Phylum_df <- summaryBy(Abundance~Phylum, data=Final.seq.melt.RNA, FUN=sum)
Phylum_df$Percent <- round(Phylum_df$Abundance.sum/sum(Phylum_df$Abundance.sum)*100, 4)
Phylum_df <- plyr::arrange(Phylum_df, plyr::desc(Percent))
Phylum_df$PercentageRound <- round(Phylum_df$Percent, digits = 2)
DT::datatable(Phylum_df)
order_df <- summaryBy(Abundance~Order, data=Final.seq.melt.RNA, FUN=sum)
order_df$Percent <- round(order_df$Abundance.sum/sum(order_df$Abundance.sum)*100, 4)
order_df <- plyr::arrange(order_df, plyr::desc(Percent))
order_df$PercentageRound <- round(order_df$Percent, digits = 2)
DT::datatable(order_df)
genus_df <- summaryBy(Abundance~Genus, data=Final.seq.melt.RNA, FUN=sum)
genus_df$Percent <- round(genus_df$Abundance.sum/sum(genus_df$Abundance.sum)*100, 4)
genus_df <- plyr::arrange(genus_df, plyr::desc(Percent))
genus_df$Round <- round(genus_df$Percent, digits = 2)
DT::datatable(genus_df)

Alpha and Beta diversity

shannon.div <- estimate_richness(psdata.p, measures = c("Shannon", "Observed"))
sampledata1<- data.frame(sample_data(psdata.p))
row.names(shannon.div) <- gsub("[.]","-", row.names(shannon.div))
sampleData <- merge(sampledata1, shannon.div, by = 0 , all = TRUE)
sampleData$New_Diet <- factor(sampleData$New_Diet, levels=c( 'CTR', 'MC1', 'MC2', 'MN3'))
sampleData$samplingTime <- factor(sampleData$samplingTime, levels=c('T1', 'T2', 'T3'))

my_comparisons <- list( c("CTR", "MC1"), c("CTR", "MC2"), c("CTR", "MN3"), 
                        c("MC1", "MC2"), c("MC1", "MN3"), c("MC2", "MN3") )


p2 <- ggboxplot(sampleData, x = "New_Diet", y = "Shannon",
                color = "New_Diet", palette = "jco", legend = "none") + 
  stat_compare_means(comparisons = my_comparisons) +
  stat_compare_means(label.y = 6.5) +
  geom_jitter(aes(colour = New_Diet), size = 2, alpha = 0.6) +
  geom_boxplot(aes(fill = New_Diet), width=0.7, alpha = 0.5) +
  theme_bw() +  theme(legend.position="none",axis.title.x=element_blank()) +  
  scale_fill_manual(values = cols) + 
  scale_colour_manual( values = cols) + 
  facet_wrap("samplingTime")
## [1] FALSE
my_comparisons <- list(c("T1", "T2"), c("T1", "T3"), c("T2", "T3") )


p3 <- ggboxplot(sampleData, x = "samplingTime", y = "Shannon",
                color = "samplingTime", palette = "jco", legend = "none")+ 
  stat_compare_means(comparisons = my_comparisons)+ # Add pairwise comparisons p-value
  stat_compare_means(label.y = 5.5) +
  geom_jitter(aes(colour = samplingTime), size = 2, alpha = 0.6) +
  geom_boxplot(aes(fill = samplingTime), width=0.7, alpha = 0.5) +
  theme_bw() +  
  theme(legend.position="none",axis.title.x=element_blank()) +
  scale_fill_manual(values = cols) + 
  scale_colour_manual( values = cols) +
  facet_wrap(. ~ New_Diet, ncol = 4)
## [1] FALSE
# plot_grid(p2, p3, labels = c('A', 'B'), label_size = 12, ncol = 2)
set.seed(1)
PCoA_bray <- ordinate(physeq = psdata.p, method = "PCoA", distance = "bray")
p4 <- plot_ordination(
  physeq = psdata.p, 
  ordination = PCoA_bray, 
  color = "samplingTime"
) + 
  geom_point(shape = 19, alpha=0.7) + 
  theme_bw() + ggtitle("PCoA Plot - Bray") + 
  xlab("PCoA 1 [22.8 %]") + ylab("PCoA 2 [7.2 %]") + 
  stat_ellipse() + scale_fill_manual(values =cols) + 
  scale_colour_manual( values = cols)
# Run adonis test
sampledf <- data.frame(sample_data(psdata.p))
bcdist <- phyloseq::distance(psdata.p, method="bray",normalized=TRUE) 
result <- adonis2(bcdist ~ samplingTime, data = sampledf, permutations = 9999)
p4 <- p4 + annotate(
  "text", x = -0.45, y = 0.28, 
  label = paste("Adonis R2 =", round(result$R2[1], 3),
                "\np-value =", result$`Pr(>F)`[1]),
  col = "black", fontface = "bold")




set.seed(1)
p5 <- plot_ordination(
  physeq = psdata.p, 
  ordination = PCoA_bray, 
  color = "New_Diet"
) + 
  geom_point(shape = 19, alpha=0.7) + 
  theme_bw() + ggtitle("PCoA Plot - Bray") + 
  xlab("PCoA 1 [22.8 %]") + ylab("PCoA 2 [7.2 %]") + 
  stat_ellipse() + scale_fill_manual(values =cols) + 
  scale_colour_manual( values = cols)
# Run adonis test
sampledf <- data.frame(sample_data(psdata.p))
result <- adonis2(bcdist ~ New_Diet, data = sampledf, permutations = 9999)
p5 <- p5 + annotate(
  "text", x = -0.47, y = 0.3, 
  label = paste("Adonis R2 =", round(result$R2[1], 3),
                "\np-value =", result$`Pr(>F)`[1]),
  col = "black", fontface = "bold")

p6 <- plot_grid(p2, p3, labels = c('A', 'B'), label_size = 12, ncol = 1, align = "hv")
p7 <- plot_grid(p4, p5, labels = c('C', 'D'), label_size = 12, ncol = 1, align = "hv")
plot_grid(p6, p7, ncol = 2, rel_widths = c(1.5, 1))

Heatmap to summarize taxonomy at different time points

ampvis2_obj <- phyloseq_to_ampvis2(psdata.p)
amp_heatmap(ampvis2_obj,
            group_by = "New_Diet",
            facet_by = "samplingTime",
            tax_aggregate = "Genus",
            tax_add = c("Kingdom", "Phylum"),
            tax_show = 50,
            color_vector = c("white", "dark red"),
            plot_colorscale = "sqrt",
            plot_values = T) +
  theme(axis.text.x = element_text(angle = 45, size=10, vjust = 1),
        axis.text.y = element_text(size=8),
        legend.position="right")

Differential abundance analysis

rabuplot(phylo_ob = psdata.r, predictor= "New_Diet", type = "Genus", facet_wrap   ="samplingTime", N_taxa = 20)

Step2. Host-microbiome interactions

Set seed as a precaution for reproducibility as some methods are non-deterministic.

set.seed(13118)
prefix <- "m"
save_TOM <- TRUE
pam_stage <- FALSE    # Partitioning around medoids, tries to put more OTUs into modules where it is not directly clear from the dendrogram cutting.
set_verbose = 1 # How much detail from the WGCNA functions? Higher means more detail.
omics_type = "otu"
plot_labeling_size = 15
take_average <- TRUE
parameter_sets <- list(set_1 = list(applied_norm = "TSS", applied_transf = "CLR", assoc_measure = "bicor"),
                       set_2 = list(applied_norm = "CSS", applied_transf = "log2", assoc_measure = "bicor"))
chosen_parameter_set <- parameter_sets$set_2

Formatting the data

psdata.r1 <- filter_taxa(psdata.p, function(x) sum(x > 4) > (0.01*length(x)), TRUE)# Remove taxa not seen more than 3 times in at least 5% of samples

psdata.r1 <- transform_sample_counts(psdata.r1, function(x) x / sum(x) )

#host_gut_mapping_file <- read.table("../../../host_data/host_gut_mapping.txt", sep = "\t", row.names = 1, header = T)
# host_gut_rawCounts_total1 <- read.table("../host_data/host_gut_count.txt")
#host_gut_rawCounts_total1 <- omics_data_host

gut_16SrRNA_mapping_file <- sample_data(psdata.r1)
#gut_16SrRNA_mapping_file$New_Diet <- tolower(gut_16SrRNA_mapping_file$New_Diet)
gut_16SrRNA_mapping_file$ID_New <- paste(gut_16SrRNA_mapping_file$samplingTime, "_", 
                                         gut_16SrRNA_mapping_file$New_Diet, "_", 
                                         gut_16SrRNA_mapping_file$samplingTank, "_", "0",
                                         gut_16SrRNA_mapping_file$sampleNumber, 
                                         sep = "")
sample_info <- read.csv("/Users/shashankgupta/Desktop/ImprovAFish/ImprovAFish/sample_info.csv", row.names = 1)
host_gut_mapping_file <- sample_info
host_gut_mapping_file <- host_gut_mapping_file[host_gut_mapping_file$ID_New %in% gut_16SrRNA_mapping_file$ID_New, ]

gut_16SrRNA_mapping_file <- gut_16SrRNA_mapping_file[gut_16SrRNA_mapping_file$ID_New %in% host_gut_mapping_file$ID_New, ]

gut_16SrRNA_mapping_file$rownames <- row.names(gut_16SrRNA_mapping_file)
row.names(gut_16SrRNA_mapping_file) <- gut_16SrRNA_mapping_file$ID_New

omics_data <- otu_table(psdata.r1)
ID_New_name_list <-  gut_16SrRNA_mapping_file$rownames
gut_16SrRNA_OTU_table <- omics_data[, colnames(omics_data) %in% ID_New_name_list]
table(colnames(gut_16SrRNA_OTU_table) ==  gut_16SrRNA_mapping_file$rownames)
## 
## TRUE 
##  142
colnames(gut_16SrRNA_OTU_table) <- gut_16SrRNA_mapping_file$ID_New
table(colnames(gut_16SrRNA_OTU_table) ==  gut_16SrRNA_mapping_file$ID_New)
## 
## TRUE 
##  142
omics_data <- gut_16SrRNA_OTU_table

WGCNA analysis

omics_data <- t(omics_data)
dim(omics_data) %>% paste(c("Samples", "OTUs"))

HellingerData<-decostand(omics_data,method = "hellinger")
omics_data <- HellingerData

dim(omics_data) %>% paste(c("Samples", "OTUs"))

powers <- c(1:10, seq(12,20,2))
suppressWarnings(sft <- pickSoftThreshold(omics_data, 
                                          powerVector = powers, 
                                          verbose = set_verbose, 
                                          networkType = "signed",
                                          corFn= chosen_parameter_set$assoc_measure))
# Find the soft thresholding power beta to which co-expression similarity is raised to calculate adjacency.
# based on the criterion of approximate scale-free topology.

idx <- min(which((-sign(sft$fitIndices[,3])*sft$fitIndices[,2]) > 0.90))
if(is.infinite(idx)){
  idx <- min(which((-sign(sft$fitIndices[,3])*sft$fitIndices[,2]) > 0.80))
  if(!is.infinite(idx)){
    st <- sft$fitIndices[idx,1]
  } else{
    idx <- which.max(-sign(sft$fitIndices[,3])*sft$fitIndices[,2])
    st <- sft$fitIndices[idx,1]
  }
} else{
  st <- sft$fitIndices[idx,1]
}


# Plot Scale independence measure and Mean connectivity measure

# Scale-free topology fit index as a function of the soft-thresholding power
data.frame(Indices = sft$fitIndices[,1],
           sfApprox = -sign(sft$fitIndices[,3])*sft$fitIndices[,2]) %>% 
  ggplot() + 
  geom_hline(yintercept = 0.9, color = "black", alpha = 0.6) + # corresponds to R^2 cut-off of 0.9
  geom_hline(yintercept = 0.8, color = "black", alpha = 0.2) + # corresponds to R^2 cut-off of 0.8
  geom_line(aes(x = Indices, y = sfApprox), color = "red", alpha = 0.1, size = 2.5) +
  geom_text(mapping = aes(x = Indices, y = sfApprox, label = Indices), color = "red", size = 4) +
  ggtitle("Scale independence") +
  xlab("Soft Threshold (power)") +
  ylab("SF Model Fit,signed R^2") + theme_bw() +
  xlim(1,20) +
  ylim(-1,1) +
  geom_segment(aes(x = st, y = 0.25, xend = st, yend = sfApprox[idx]-0.05), 
               arrow = arrow(length = unit(0.2,"cm")), 
               size = 0.5)-> scale_independence_plot 




# Mean connectivity as a function of the soft-thresholding power

data.frame(Indices = sft$fitIndices[,1],
           meanApprox = sft$fitIndices[,5]) %>% 
  ggplot() + 
  geom_line(aes(x = Indices, y = meanApprox), color = "red", alpha = 0.1, size = 2.5) +
  geom_text(mapping = aes(x = Indices, y = meanApprox, label = Indices), color = "red", size = 4) +
  xlab("Soft Threshold (power)") +
  ylab("Mean Connectivity") + theme_bw() +
  geom_segment(aes(x = st-0.4, 
                   y = sft$fitIndices$mean.k.[idx], 
                   xend = 0, 
                   yend = sft$fitIndices$mean.k.[idx]),
               arrow = arrow(length = unit(0.2,"cm")), 
               size = 0.4) +
  ggtitle(paste0("Mean connectivity: ", 
                 round(sft$fitIndices$mean.k.[idx],2))) -> mean_connectivity_plot


cowplot::plot_grid(scale_independence_plot, mean_connectivity_plot, ncol = 2, align = "h", labels = c("A", "B"), label_size = plot_labeling_size) -> si_mc_plot

si_mc_plot

allowWGCNAThreads()
modules.omics.Y <- blockwiseModules(omics_data,
                                    power = st, 
                                    networkType = "signed", 
                                    TOMType = "signed",
                                    corType = chosen_parameter_set$assoc_measure,
                                    #maxPOutliers = 0.05,
                                    #deepSplit = 4, # Default 2
                                    minModuleSize = 12, # 30
                                    #minCoreKME = 0.5,      # Default 0.5
                                    #minCoreKMESize = 2,    # Default minModuleSize/3,
                                    #minKMEtoStay = 0.5,    # Default 0.3
                                    #reassignThreshold = 0, # Default 1e-6
                                    #mergeCutHeight = 0.4,  # Default 0.15
                                    #pamStage = pam_stage, 
                                    #pamRespectsDendro = TRUE,
                                    #replaceMissingAdjacencies = TRUE,
                                    numericLabels = TRUE,
                                    saveTOMs = save_TOM,
                                    saveTOMFileBase = "TOM",
                                    verbose = 3,
                                    maxBlockSize=8000,
                                    nThreads = 10)
moduleLabels <- modules.omics.Y$colors
moduleColors <- labels2colors(modules.omics.Y$colors)

hubs <- chooseTopHubInEachModule(omics_data, modules.omics.Y$colors, power = st, omitColors = "0")

stage2results_Y <- list(modules = modules.omics.Y, 
                        hubs = hubs)
saveRDS(stage2results_Y, "/Users/shashankgupta/Desktop/ImprovAFish/github/ImprovaFish/OmicsIntegration/stage2results_Metagenomics.rds")

Convert labels to colors for plotting and check how many ASVs are there in each modules

merged_colors <- labels2colors(stage2results_Y$modules$colors)
n_modules <- unique(merged_colors) %>% length()

samples_good <- sum(stage2results_Y$modules$goodSamples) == length(stage2results_Y$modules$goodSamples)
genes_good <- sum(stage2results_Y$modules$goodGenes) == length(stage2results_Y$modules$goodGenes)

ME_good <- sum(stage2results_Y$modules$MEsOK) == length(stage2results_Y$modules$MEsOK)
table(stage2results_Y$modules$colors) %>% 
  as.data.frame() %>% 
  dplyr::rename(Module = Var1, Size = Freq) %>%
  dplyr::mutate(Module_color = labels2colors(as.numeric(as.character(Module)))) -> module_size


module_size %>% 
  ggplot(aes(x = Module, y = Size, fill = Module)) +
  geom_col(color =  "#000000") +
  ggtitle("Number of genes in each module") +
  theme(legend.position = "none") + theme_bw() +
  scale_fill_manual(values = setNames(module_size$Module_color,module_size$Module)) +
  geom_text(aes(label = Size),vjust = 0.5, hjust = -0.18, size = 3.5) +
  ylim(0, max(module_size$Size)*1.1) +
  theme(plot.margin = margin(2, 2, 2, 2, "pt")) +
  coord_flip()-> module_size_barplot

Module correlation to other modules

all(rownames(omics_data) == rownames(MEs))
dim(omics_data) %>% paste0(c(" samples", " OTUs"))
kME <- bicor(omics_data, MEs, maxPOutliers = 0.05)
dim(kME) %>% paste0(c(" OTUs", " modules"))

intra_cor <- c()
for (i in 1:ncol(omics_data)) {
  m <- stage2results_Y$modules$colors[i]
  intra_cor[i] <- kME[i, paste0("ME", m)]
  if(m != 0){
    intra_cor[i] <- kME[i, paste0("ME", m)]
  } else{
    intra_cor[i] <- NA
  }
  
}

idx <- which(is.na(intra_cor))
intra_cor <- intra_cor[-idx]
# Corr within modules
corr_within_module <- function(omics_data, modules, module_x = 1){
  idx.omics_data <- which(modules$colors == module_x)
  idx.me <- which(colnames(modules$MEs) == paste0("ME",module_x))
  kME_x <- bicor(omics_data[,idx.omics_data], modules$MEs[,idx.me], maxPOutliers = 0.05)
  kME_x
}

ggplot.list <- list()

for(m in colnames(stage2results_Y$modules$MEs))
  {
  h <- as.numeric(sub("ME","", m))
  data.frame(x = suppressWarnings(corr_within_module(omics_data = omics_data, modules = stage2results_Y$modules, module_x = h))) %>% 
    ggplot() + 
    #geom_density(aes(x = x), fill = labels2colors(h), color = "black", alpha = 0.5) + 
    geom_histogram(aes(x), fill = labels2colors(h), color = "black", alpha = 0.5, bins = 20) + 
    xlim(-1, 1) +
    xlab("ASV correlation")+ theme_bw() +
    ggtitle(paste0(prefix,m)) -> da_plot
  
  ggplot.list[[m]] <- da_plot
}

ggplot.list <- ggplot.list[ggplot.list %>% names() %>% sub("ME", "", .) %>% as.numeric() %>% order()]
cowplot::plot_grid(plotlist = ggplot.list, ncol = 6) -> density_all_plot

# comine to one plot
cowplot::plot_grid(si_mc_plot , density_eigen, ncol = 1, rel_heights = c(0.8,1)) -> part_1


cowplot::plot_grid(part_1, module_size_barplot, labels = c("", "C"), label_size = plot_labeling_size, rel_widths = c(1,0.5)) -> part_2


cowplot::plot_grid(part_2, density_all_plot, ncol = 1, rel_heights = c(0.8,1), labels = c("", "F"), label_size = plot_labeling_size)