Step 1. Transcriptomics (Liver)

Load packages

library(GO.db)
library(GenomicFeatures)
library(AnnotationDbi)
library(DESeq2)
library(dbplyr)
library(WGCNA)
library(DT)
library(clusterProfiler)
library(dplyr)
library(AnnotationHub)
library(ggplot2)
library(tibble)
library(readr)
library(png)
library(gridExtra)
library(grid)
library(stringr)
library(RColorBrewer)
library(plyr)
library(limma)

Set seed as a precaution for reproducibility as some methods are non-deterministic.

set.seed(13118)
Run_analysis <- TRUE    # if FALSE it tries to load data instead of running the module creation step.
prefix <- "h"
save_TOM <- FALSE       # Can get rather big
pam_stage <- FALSE      # Partitioning around medoids, tries to put more genes into modules where it is not directly clear from the dendrogram cutting.
set_verbose = 1         # How much detail from the WGCNA functions? A higher number means more detail.
assoc_measure = "bicor"
plot_labeling_size = 15
theme_set(theme_bw())

Import data

Loop to read in the raw count files and store them in a list

rawCounts_list <- list()
for (i in 1:4) {
  file_path <- paste0("~/Desktop/salmon_project/Gene_Counts/featurecounts_X204SC21043349-Z02-F001_", i, "_genome")
  rawCounts_list[[i]] <- read.delim(file_path)
}

rawCounts_total <- join_all(rawCounts_list, by = 'Geneid', type = 'full')

Loop to simplify the code for removing unnecessary information from column names:

patterns_to_remove <- c(
  "X.home.projects.ku.cbd.data.HoloFish.IMPROVAFISH.ANALYSES_HostRNA.5_star_2pass.X204SC21043349.Z02.F001_1.",
  "X.home.projects.ku.cbd.data.HoloFish.IMPROVAFISH.ANALYSES_HostRNA.5_star_2pass.X204SC21043349.Z02.F001_2.",
  "X.home.projects.ku.cbd.data.HoloFish.IMPROVAFISH.ANALYSES_HostRNA.5_star_2pass.X204SC21043349.Z02.F001_3.",
  "X.home.projects.ku.cbd.data.HoloFish.IMPROVAFISH.ANALYSES_HostRNA.5_star_2pass.X204SC21043349.Z02.F001_4.",
  "_Aligned.sortedByCoord.out.bam",
  "\\..*"
)

for (pattern in patterns_to_remove) {
  colnames(rawCounts_total) <- gsub(pattern, "", colnames(rawCounts_total))
}

Loop to remove columns containing specific strings:

columns_to_remove <- c("Chr", "Start", "End", "BLK", "CL1", "CP1", "CP2", "CP3")

column_indices_to_remove <- which(names(rawCounts_total) %in% columns_to_remove)

rawCounts_total <- rawCounts_total[, -column_indices_to_remove]
rawCounts_total <- data.frame(rawCounts_total[, -1], row.names = rawCounts_total[, 1])

Finally, for renaming row and column names. Also main idea of this chunk of code is make sure we have same number of samples in countData and samples in colData for DESeq2. Also the order of the samples in colData need to match the order of the samples in countData

mapping_file <- read.table("~/Desktop/salmon_project/SampleData.txt", header = T, sep = "\t", row.names = 1)
mapping_file <- subset(mapping_file, !Sample_Type %in% c("BLK" ,"CL1", "CP1", "CP2", "CP3", "Gills", "Hind_Gut"))
mapping_file$New_Diet <- toupper(mapping_file$New_Diet)
mapping_file$common <- rownames(mapping_file)
mapping_file$common <- gsub("_Lh0", "_", mapping_file$common)
rownames(mapping_file) <- mapping_file$common
colnames(rawCounts_total) <- gsub("_Lh0", "_", colnames(rawCounts_total))


rows_to_remove <- c("T0")
mapping_file <- mapping_file[!mapping_file$Time %in% rows_to_remove,]
rawCounts_total <- rawCounts_total[, !grepl("T0_ctr_", names(rawCounts_total))]

reorder_idx <- match(rownames(mapping_file), colnames(rawCounts_total))
rawCounts_total <- rawCounts_total[, reorder_idx]
all(colnames(rawCounts_total) == rownames(mapping_file))
## [1] TRUE
omics_data_host_Liver <- rawCounts_total
sample_info <- mapping_file
sample_info$common <- rownames(sample_info)
#sample_info$New_Diet <- tolower(sample_info$New_Diet)
#Rename the sample names based on new diet
sample_info$ID_New <- paste(sample_info$Time, "_",
                            sample_info$New_Diet, "_", 
                            sample_info$Tank_number, "_", "0",
                            sample_info$Sample_Number, 
                                      sep = "")
head_change <- subset(sample_info, select=c("common" ,"ID_New"))
row.names(sample_info) <- sample_info$ID_New

omics_data_host_Liver1 <- omics_data_host_Liver[, colnames(omics_data_host_Liver) %in% sample_info$common]
omics_data_host_Liver <- omics_data_host_Liver1
all(colnames(omics_data_host_Liver) == sample_info$common)
## [1] TRUE
# rename colnames of omics_data_host_Liver using new ID
df1 <- subset(sample_info, select= c("common", "ID_New"))
new_colnames <- df1$ID_New[match(colnames(omics_data_host_Liver), df1$common)]
colnames(omics_data_host_Liver) <- ifelse(is.na(new_colnames), colnames(omics_data_host_Liver), new_colnames)
all(colnames(omics_data_host_Liver) == sample_info$ID_New)
## [1] TRUE

DESeq2 analysis

df <- round(omics_data_host_Liver) %>%
  # The next steps require a data frame and round() returns a matrix
  as.data.frame() %>%
  # Only keep rows that have total counts above the cutoff
  dplyr::filter(rowSums(.) >= 50)

all(colnames(df) == rownames(sample_info))
## [1] TRUE
dds_liver <- DESeqDataSetFromMatrix(
  countData = df, # Our prepped data frame with counts
  colData = sample_info, # Data frame with annotation for our samples
  design = ~1 # Here we are not specifying a model
)
dds_liver$group <- factor(paste0(dds_liver$Time, dds_liver$New_Diet))
design(dds_liver) <- ~ group-1

dds_liver <- DESeq(dds_liver, parallel = T)
resultsNames(dds_liver)
##  [1] "groupT1CTR" "groupT1MC1" "groupT1MC2" "groupT1MN3" "groupT2CTR"
##  [6] "groupT2MC1" "groupT2MC2" "groupT2MN3" "groupT3CTR" "groupT3MC1"
## [11] "groupT3MC2" "groupT3MN3"
annot_function <- read_tsv("/Users/shashankgupta/Desktop/ImprovAFish/Salmo_salar-GCA_905237065.2_gene_annotations.tsv")
annot_function <- annot_function[, c("gene_id", "v2.gene_id.NCBI","v2.gene_name.ensembl", "v2.product")]

Time point 1 (T1)

Diet 1
res <- results(dds_liver,
               contrast = list("groupT1MC1","groupT1CTR"))
table(res$padj < 0.05)
## 
## FALSE  TRUE 
## 30197    71
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
  filter(padj <0.05)%>%
  arrange(padj)

res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )
DT::datatable(res_tbl)
# res_tbl %>% 
#   filter(log2FoldChange > 0) %>%
#   select(v2.product)
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements
Diet 2
res <- results(dds_liver,
               contrast = list("groupT1MC2","groupT1CTR"))
table(res$padj < 0.05)
## 
## FALSE  TRUE 
## 27600    12
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
  filter(padj <0.05)%>%
  arrange(padj) 
res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )
DT::datatable(res_tbl)
# res_tbl %>% 
#   filter(log2FoldChange > 0) %>%
#   select(v2.product)
# 
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements
Diet 3
res <- results(dds_liver,
               contrast = list("groupT1MN3","groupT1CTR"))
table(res$padj < 0.05)
## 
## FALSE  TRUE 
## 34250     3
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
  filter(padj <0.05)%>%
  arrange(padj) 
res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )
DT::datatable(res_tbl)
# res_tbl %>% 
#   filter(log2FoldChange > 0) %>%
#   select(v2.product)
# 
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements

Time point 2 (T2)

Diet 1
# T2
res <- results(dds_liver,
               contrast = list("groupT2MC1","groupT2CTR"))
table(res$padj < 0.05)
## 
## FALSE  TRUE 
## 34251     2
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
  filter(padj <0.05)%>%
  arrange(padj) 
res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )

DT::datatable(res_tbl)
# 
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements
Diet 2
res <- results(dds_liver,
               contrast = list("groupT2MC2","groupT2CTR"))
table(res$padj < 0.05)
## 
## FALSE  TRUE 
## 18974     5
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
  filter(padj <0.05)%>%
  arrange(padj) 
res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )

DT::datatable(res_tbl)
# 
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements
Diet 3
res <- results(dds_liver,
               contrast = list("groupT2MN3","groupT2CTR"))
table(res$padj < 0.05)
## 
## FALSE  TRUE 
## 34251     2
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
  filter(padj <0.05)%>%
  arrange(padj) 
res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )

DT::datatable(res_tbl)
# 
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements

Time point 3 (T3)

Diet 1
res <- results(dds_liver,
               contrast = list("groupT3MC1","groupT3CTR"))
table(res$padj < 0.05)
## 
## FALSE  TRUE 
## 29413   191
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
  filter(padj <0.05)%>%
  arrange(padj) 

res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )
DT::datatable(res_tbl)
# res_tbl %>% 
#   filter(log2FoldChange > 0) %>%
#   select(v2.product)
# 
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements
Diet 2
res <- results(dds_liver,
               contrast = list("groupT3MC2","groupT3CTR"))
table(res$padj < 0.05)
## 
## FALSE  TRUE 
## 27983   293
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
  filter(padj <0.05)%>%
  arrange(padj) 

res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )
DT::datatable(res_tbl)
# res_tbl %>% 
#   filter(log2FoldChange > 0) %>%
#   select(v2.product)
# 
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements
Diet 3
res <- results(dds_liver,
               contrast = list("groupT3MN3","groupT3CTR"))
table(res$padj < 0.05)
## 
## FALSE  TRUE 
## 34245     8
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
  filter(padj <0.05)%>%
  arrange(padj) 
res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )
DT::datatable(res_tbl)
# res_tbl %>% 
#   filter(log2FoldChange > 0) %>%
#   select(v2.product)
# 
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements
# Create the dataframe with the required columns
df <- data.frame(Group = character(),
                 "DEGs (up-regulated)" = integer(),
                 "DEGs (down-regulated)" = integer(),
                 "DEGs (up)-v2.products" = character(),
                 "DEGs (down)-v2.products" = character())

# Loop through each group and store the results in the dataframe
for (treatment in c("T1", "T2", "T3")) {
  for (group in c("MC1", "MC2", "MN3")) {
    sample_name <- paste0("group", treatment, group)
    CTR_sample <- paste0("group", treatment, "CTR")
    
    # Calculate results
    res <- results(dds_liver, contrast = list(sample_name, CTR_sample))
    
    # Filter the results and count the number of positive and negative elements
    res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
      filter(padj < 0.05) %>%
      arrange(padj)
    res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )
    
    # Separate the up-regulated and down-regulated genes
    up_genes <- res_tbl$log2FoldChange > 0
    down_genes <- res_tbl$log2FoldChange < 0
    up_regulated <- sum(up_genes)
    down_regulated <- sum(down_genes)
    
    # Store the results in the dataframe
    df <- rbind(df, c(sample_name, up_regulated, down_regulated,
                      paste0(res_tbl$v2.product[up_genes], collapse = ", "),
                      paste0(res_tbl$v2.product[down_genes], collapse = ", ")))
  }
}

# Rename the columns
names(df) <- c("Groups",
               "DEGs (up-regulated)",
               "DEGs (down-regulated)",
               "DEGs (up)-v2.products",
               "DEGs (down)-v2.products")

DT::datatable(df)

Step 2. Host (Liver)-microbiome interactions

WGCNA analysis

omics_data_host <- t(assay(vst(dds_liver)))
dim(omics_data_host) %>% paste(c("Samples", "Genes"))
## [1] "144 Samples" "34256 Genes"
powers <- c(c(1:10), seq(from = 12, to=20, by=2))
sft <- pickSoftThreshold(omics_data_host, 
                         powerVector = powers, 
                         verbose = set_verbose, 
                         networkType = "signed",
                         corFn= "bicor")
idx <- min(which((-sign(sft$fitIndices[,3])*sft$fitIndices[,2]) > 0.90))
if(is.infinite(idx)){
  idx <- min(which((-sign(sft$fitIndices[,3])*sft$fitIndices[,2]) > 0.80))
  if(!is.infinite(idx)){
    st <- sft$fitIndices[idx,1]
  } else{
    idx <- which.max(-sign(sft$fitIndices[,3])*sft$fitIndices[,2])
    st <- sft$fitIndices[idx,1]
  }
} else{
  st <- sft$fitIndices[idx,1]
}

data.frame(Indices = sft$fitIndices[,1],
           sfApprox = -sign(sft$fitIndices[,3])*sft$fitIndices[,2]) %>% 
  ggplot() + 
  geom_hline(yintercept = 0.9, color = "red", alpha = 0.6) + # corresponds to R^2 cut-off of 0.9
  geom_hline(yintercept = 0.8, color = "red", alpha = 0.2) + # corresponds to R^2 cut-off of 0.8
  geom_line(aes(x = Indices, y = sfApprox), color = "red", alpha = 0.1, size = 2.5) +
  geom_text(mapping = aes(x = Indices, y = sfApprox, label = Indices), color = "red", size = 4) +
  ggtitle("Scale independence") +
  xlab("Soft Threshold (power)") +
  ylab("SF Model Fit,signed R^2") +
  xlim(1,20) +
  ylim(-1,1) + theme_bw() +
  geom_segment(aes(x = st, y = 0.25, xend = st, yend = sfApprox[idx]-0.05), 
               arrow = arrow(length = unit(0.2,"cm")), 
               size = 0.5)-> scale_independence_plot 



data.frame(Indices = sft$fitIndices[,1],
           meanApprox = sft$fitIndices[,5]) %>% 
  ggplot() + 
  geom_line(aes(x = Indices, y = meanApprox), color = "red", alpha = 0.1, size = 2.5) +
  geom_text(mapping = aes(x = Indices, y = meanApprox, label = Indices), color = "red", size = 4) +
  xlab("Soft Threshold (power)") +
  ylab("Mean Connectivity") +theme_bw() +
  geom_segment(aes(x = st-0.4, 
                   y = sft$fitIndices$mean.k.[idx], 
                   xend = 0, 
                   yend = sft$fitIndices$mean.k.[idx]),
               arrow = arrow(length = unit(0.2,"cm")), 
               size = 0.4) +
  ggtitle(paste0("Mean connectivity: ", 
                 round(sft$fitIndices$mean.k.[idx],2))) -> mean_connectivity_plot


cowplot::plot_grid(scale_independence_plot, mean_connectivity_plot, ncol = 2, align = "h", labels = c("A", "B"), label_size = plot_labeling_size) -> si_mc_plot
si_mc_plot

modules.omics_X_Liver <- blockwiseModules(omics_data_host, 
                                  power = st, 
                                  networkType = "signed", 
                                  TOMType = "signed",
                                  corType = assoc_measure,
                                  #maxPOutliers = 0.05,
                                  #deepSplit = 4, # Default 2
                                  minModuleSize = 20,           # 30
                                  #minCoreKME = 0.5,            # Default 0.5
                                  #minCoreKMESize = 2,          # Default minModuleSize/3,
                                  #minKMEtoStay = 0.5,          # Default 0.3
                                  #reassignThreshold = 0,       # Default 1e-6
                                  #mergeCutHeight = 0.2,        # Default 0.15
                                  #pamStage = pam_stage, 
                                  #pamRespectsDendro = TRUE,
                                  #replaceMissingAdjacencies = TRUE,
                                  nThreads = 10,
                                  numericLabels = TRUE,
                                  saveTOMs = FALSE,
                                  saveTOMFileBase = "HOST_TOM",
                                  verbose = 1,
                                  maxBlockSize=8000)
hubs <- chooseTopHubInEachModule(omics_data_host, modules.omics_X_Liver$colors, power = st, omitColors = "0")

stage2results_X_Liver <- list(modules = modules.omics_X_Liver, 
                      hubs = hubs)

saveRDS(stage2results_X_Liver, "/Users/shashankgupta/Desktop/ImprovAFish/github/ImprovaFish/OmicsIntegration/stage2results_Transcriptomics_Liver.rds")
# Convert labels to colors for plotting
merged_colors <- labels2colors(stage2results_X_Liver$modules$colors)
n_modules <- unique(merged_colors) %>% length()

samples_good <- sum(stage2results_X_Liver$modules$goodSamples) == length(stage2results_X_Liver$modules$goodSamples)
genes_good <- sum(stage2results_X_Liver$modules$goodGenes) == length(stage2results_X_Liver$modules$goodGenes)

ME_good <- sum(stage2results_X_Liver$modules$MEsOK) == length(stage2results_X_Liver$modules$MEsOK)
table(stage2results_X_Liver$modules$colors) %>% 
  as.data.frame() %>% 
  dplyr::rename(Module = Var1, Size = Freq) %>%
  dplyr::mutate(Module_color = labels2colors(as.numeric(as.character(Module)))) -> module_size

module_size %>% 
  ggplot(aes(x = Module, y = Size, fill = Module)) +
  geom_col(color =  "#000000") +
  ggtitle("Number of genes in each module") +
  theme(legend.position = "none") + 
  scale_fill_manual(values = setNames(module_size$Module_color,module_size$Module)) +
  geom_text(aes(label = Size),vjust = 0.5, hjust = -0.18, size = 3.5) +
  ylim(0, max(module_size$Size)*1.1) + theme_bw() +
  theme(plot.margin = margin(2, 2, 2, 2, "pt")) +
  coord_flip() -> module_size_barplot
module_size_barplot + theme(legend.position = "none") -> module_size_barplot
module_size_barplot

MEs <- stage2results_X_Liver$modules$MEs
# Module correlation to other modules
MEs_R <- bicor(MEs, MEs, maxPOutliers = 0.05)

idx.r <- which(rownames(MEs_R) == "ME0")
idx.c <- which(colnames(MEs_R) == "ME0")
MEs_R_noME0 <- MEs_R[-idx.r, -idx.c]

MEs_R_noME0[upper.tri(MEs_R_noME0)] %>% 
  as.data.frame() %>% 
  dplyr::rename("correlation" = ".") %>% 
  ggplot(aes(correlation)) + 
  geom_histogram(bins = 20) +
  #geom_density() + 
  xlim(-1, 1) +
  ggtitle(paste0(prefix,"ME correlation\n w/o ",prefix ,"ME0")) -> MEs_R_density

pheatmap::pheatmap(MEs_R_noME0, color = colorRampPalette(c("Blue", "White", "Red"))(100),
                   silent = T, 
                   breaks = seq(-1,1,length.out = 101),
                   treeheight_row = 10, 
                   treeheight_col = 10,
                   main = paste0(prefix,"ME correlation heatmap w/o ", prefix ,"ME0"),
                   labels_row = rep("", length(rownames(MEs_R))), #paste0(prefix, rownames(MEs_R)),
                   labels_col = rep("", length(colnames(MEs_R))) #paste0(prefix, colnames(MEs_R))
) -> MEs_R_Corr

cowplot::plot_grid(MEs_R_density, MEs_R_Corr$gtable, labels = c("D", "E"), label_size = plot_labeling_size, rel_widths = c(0.6, 1)) -> density_eigen

all(rownames(omics_data_host) == rownames(MEs))
## [1] TRUE
dim(omics_data_host) %>% paste0(c(" samples", " genes"))
## [1] "144 samples" "34256 genes"
kME <- bicor(omics_data_host, MEs, maxPOutliers = 0.05)
dim(kME) %>% paste0(c(" genes", " modules"))
## [1] "34256 genes" "77 modules"
# Corr within modules
corr_within_module <- function(omics_data_host, modules, module_x = 1){
  idx.omics_data_host <- which(modules$colors == module_x)
  idx.me <- which(colnames(modules$MEs) == paste0("ME",module_x))
  kME_x <- bicor(omics_data_host[,idx.omics_data_host], modules$MEs[,idx.me], maxPOutliers = 0.05)
  kME_x
}

ggplot.list <- list()

modules <- 
  colnames(stage2results_X_Liver$modules$MEs)[colnames(stage2results_X_Liver$modules$MEs) %>% sub("ME", "", .) %>% as.numeric() %>% order()]

for(m in colnames(stage2results_X_Liver$modules$MEs)){
  h <- as.numeric(sub("ME","", m))
  data.frame(x = suppressWarnings(corr_within_module(omics_data_host = omics_data_host, modules = stage2results_X_Liver$modules, module_x = h))) %>% 
    ggplot() + 
    #geom_density(aes(x = x), fill = labels2colors(h), color = "black", alpha = 0.5) +
    geom_histogram(aes(x), fill = labels2colors(h), color = "black", alpha = 0.5, bins = 20) + 
    xlim(-1, 1) +
    xlab("Gene correlation") +
    ggtitle(paste0(prefix,m)) -> da_plot
  
  ggplot.list[[m]] <- da_plot
}
ggplot.list <- ggplot.list[ggplot.list %>% names() %>% sub("ME", "", .) %>% as.numeric() %>% order()]
cowplot::plot_grid(plotlist = ggplot.list) -> density_all_plot

# comine to one plot
cowplot::plot_grid(si_mc_plot , density_eigen, ncol = 1, rel_heights = c(0.8,1)) -> part_1
cowplot::plot_grid(part_1, module_size_barplot, labels = c("", "C"), label_size = plot_labeling_size, rel_widths = c(1,0.5)) -> part_2
cowplot::plot_grid(part_2, density_all_plot, ncol = 1, rel_heights = c(0.8,1), labels = c("", "F"), label_size = plot_labeling_size) -> part_3
part_3

This code takes the results of a omics data and extracts the “hubs”, or most highly connected genes in the network, for each module.

stage2results_X_Liver$hubs %>% 
  as.data.frame() %>% 
  dplyr::rename("gene_id" = ".") %>% 
  tibble::rownames_to_column(var = "Module") -> top_genes

DT::datatable(top_genes)

Statistical analysis

Linear regression analysis on the module eigengenes (principal components) from omics data, using the limma package. The sample information is loaded and the factors ‘Time’, ‘New_Diet’, and ‘Day’ are created based on the sample IDs. The design matrix for the regression is created using the ‘Day’ factor, and a linear model fit is performed on the transposed module eigengenes. The regression results are then run through an empirical Bayesian analysis. The topTable function is used to extract the regression statistics and create a data frame with the module names as the row names.

#modules.omics_X_Liver.backup <- modules.omics_X_Liver 
module_eigengenes <- modules.omics_X_Liver$MEs
all.equal(sample_info$ID_New, rownames(module_eigengenes))
## [1] TRUE
sample_info %>% 
  mutate(Time = factor(Time)) %>% 
  mutate(New_Diet = factor(New_Diet)) %>%
  mutate(Day = ifelse(Time == "T1", paste0("T1_", New_Diet),
                      ifelse(Time == "T2", paste0("T2_", New_Diet), paste0("T3_", New_Diet)))) -> meta
DT::datatable(meta)
des_mat <- model.matrix(~ meta$Day)
fit <- limma::lmFit(t(module_eigengenes), design = des_mat)
fit <- limma::eBayes(fit)

stats_df <- limma::topTable(fit, number = ncol(module_eigengenes)) %>%
  tibble::rownames_to_column("module")

DT::datatable(stats_df)

Let’s make plot of module 17

module_df <- module_eigengenes %>%
  tibble::rownames_to_column("ID_New") %>%
  dplyr::inner_join(meta %>%
                      dplyr::select(ID_New, Day, Time, New_Diet),
                    by = c("ID_New" = "ID_New"))


ggplot(module_df, aes(x = Time, y = ME17, color = Day)) +
  geom_boxplot(width = 0.2, outlier.shape = NA) +
  ggforce::geom_sina(maxwidth = 0.3) +
  theme_classic() + facet_wrap("New_Diet")

ggplot(module_df, aes(x = Day, y = ME17, color = Time)) +
  geom_boxplot(width = 0.2, outlier.shape = NA) +
  ggforce::geom_sina(maxwidth = 0.3) +
  theme_classic()

What genes are a part of module 21?

gene_module_key <- tibble::enframe(modules.omics_X_Liver$colors, name = "gene", value = "module") %>%
  # Let's add the `ME` part so its more clear what these numbers are and it matches elsewhere
  dplyr::mutate(module = paste0("ME", module))

# gene_module_key %>%
#   dplyr::filter(module == "ME21") 
DT::datatable(gene_module_key)

Functional analysis

The code in R connects to the AnnotationHub package and selects OrgDb annotations for the species “Salmo salar”. It creates two objects, SsalOrg, by subsetting and extracting the desired annotations from the ah object.

# get OrgDb from AnnotationHub
ah <- AnnotationHub()
SsalOrg <- subset(ah, ah$species == "Salmo salar" & ah$rdataclass=="OrgDb")
SsalOrg <- ah[["AH107424"]]

This script performs gene enrichment analysis on multiple modules of genes. The modules of genes are generated from omics data, and the gene annotations are read from a csv file. The probes from the omics data are matched with the gene IDs from the annotations, and the LocusLink IDs are obtained. The unique modules are then selected, and their LocusLink IDs are written into separate text files. Finally, gene ontology (GO) and KEGG pathway analysis are performed for each module using the enrichGO and enrichKEGG functions, respectively. The results are then visualized using the dotplot function.

moduleLabels <- modules.omics_X_Liver$colors
moduleColors <- labels2colors(modules.omics_X_Liver$colors)
table(moduleColors)
## moduleColors
##   antiquewhite4         bisque4           black            blue           blue2 
##              45              64             416             790              20 
##           brown          brown2          brown4          coral1          coral2 
##             649              21              70              46              40 
##            cyan       darkgreen        darkgrey     darkmagenta  darkolivegreen 
##             306             206             192             107             111 
## darkolivegreen4      darkorange     darkorange2         darkred   darkseagreen4 
##              22             167              73             207              47 
##   darkslateblue   darkturquoise      firebrick4     floralwhite           green 
##              58             200              23              74             485 
##     greenyellow            grey          grey60       honeydew1      indianred4 
##             373           21281             258              47              23 
##           ivory  lavenderblush3      lightcoral       lightcyan      lightcyan1 
##              75              48              31             260              87 
##      lightgreen      lightpink4  lightsteelblue lightsteelblue1     lightyellow 
##             246              48              31              89             230 
##         magenta          maroon    mediumorchid   mediumpurple2   mediumpurple3 
##             396              49              40              32              91 
##    midnightblue    navajowhite2          orange      orangered3      orangered4 
##             289              49             172              36              92 
##   paleturquoise  palevioletred3            pink            plum           plum1 
##             134              49             409              36              95 
##           plum2          purple             red       royalblue     saddlebrown 
##              51             387             433             214             152 
##          salmon         salmon4         sienna3         skyblue        skyblue1 
##             336              50             105             152              37 
##        skyblue2        skyblue3       steelblue             tan        thistle1 
##              40             100             151             353              51 
##        thistle2       turquoise          violet           white          yellow 
##              51            1014             113             159             632 
##         yellow4     yellowgreen 
##              39             101
# table(moduleLabels)
# table(moduleColors, moduleLabels)

MEs <- modules.omics_X_Liver$MEs
geneTree <- modules.omics_X_Liver$dendrograms[[1]]


#https://gitlab.com/garethgillard/ssalv3annotation
annot1 <- read.csv("/Users/shashankgupta/Desktop/ImprovAFish/github/Input/Salmo_salar-GCA_905237065.2_gene_annotations.csv", header = TRUE)

probes <- colnames(omics_data_host) 
probes2annot <- match(probes, annot1$gene_id)
allLLIDs <- annot1$v2.gene_id.NCBI[probes2annot]

intModules <- unique(moduleColors)
for (module in intModules) 
{
  # Select module probes 
  modGenes = (moduleColors==module) 
  # Get their entrez ID codes 
  modLLIDs = allLLIDs[modGenes]
  # Write them into a file 
  fileName = paste("LocusLinkIDs-", module, ".txt", sep=""); 
  write.table(as.character(modLLIDs), file = fileName, row.names = FALSE, col.names = FALSE) 
}
fileName = paste("LocusLinkIDs-all.txt", sep="");
write.table(as.character(allLLIDs), file = fileName, row.names = FALSE, col.names = FALSE)

GO over-representation analysis

setwd("/Users/shashankgupta/Desktop/ImprovAFish/github/ImprovaFish/Transcriptomics/intermediate_files/Liver/")
all.df <- read.table("LocusLinkIDs-all.txt")
all.df$V1 <- as.character(all.df$V1)
# Get unique module colors
list_names <- unique(moduleColors)

for (i in list_names) {
  # Read LocusLinkIDs file
  assign(paste0(i,"_df"), read.table(paste0("LocusLinkIDs-",i,".txt")))
  # Enrich GO terms
  assign(paste0("resGO_",i), enrichGO(gene = get(paste0(i,"_df"))$V1, ont = "BP", universe = all.df$V1, OrgDb = SsalOrg, pvalueCutoff  = 0.05,qvalueCutoff  = 0.05))
  # Plot GO terms using dotplot
  assign(paste0("resGO_dotplot_",i), dotplot(get(paste0("resGO_",i))))
}

Save all the plots

resGO_dot_plots <- ls(pattern="resGO_dotplot_*")
for (i in 1:length(resGO_dot_plots)) {
  tryCatch({
    png(paste0(resGO_dot_plots[i], ".png"))
    print(get(resGO_dot_plots[i]))
    dev.off()
  }, error = function(e) {
    #print(paste("Error printing", resGO_dot_plots[i], ":", e))
    file.remove(paste0(resGO_dot_plots[i], ".png"))
  })
}
png_files <- list.files(pattern = "^resGO_dotplot_.*\\.png$")
raster_list <- lapply(png_files, readPNG)
plot <- grid.arrange(grobs = lapply(raster_list, rasterGrob), ncol = 3)
<div style="text-align:center; color:red; font-size: 14pt;">Gene Ontology (GO) analysis</div>

Gene Ontology (GO) analysis

dev.off()
## quartz_off_screen 
##                 3
moduleColors <- modules.omics_X_Liver$colors
mergedMEs <- modules.omics_X_Liver$MEs
colorOrder = c("grey", standardColors(50));
moduleLabels = match(moduleColors, colorOrder)-1;
MEs <- mergedMEs
MEs <- MEs[ , -which(names(MEs) %in% c("ME0"))]
modNames <- substring(names(MEs), 3)

# Correlating each genes expression profile with the module eigengenes in order to create module gene sets
geneModuleMembership <- as.data.frame(cor(omics_data_host, MEs, use = "p"))

# Iteratively creating a list of module genesets to test. These are in ensembl ids
moduleGeneSets<-lapply(modNames,function(module){
  column = match(module, modNames)
  moduleGenes = moduleColors==module
  rownames(geneModuleMembership[moduleGenes,])
})
names(moduleGeneSets)<-modNames

# Trimming the module gene sets so that the final two digits after the "." are removed
moduleGeneSets.trimmed<-lapply(moduleGeneSets,function(x){
  str_split_fixed(x,"\\.",2)[,1]
})

moduleGeneSets.Entrez <- lapply(moduleGeneSets.trimmed, function(x) {
  x <- match(x, annot1$gene_id)
  x <- annot1$v2.gene_id.NCBI[x]
  return(x)
})

ck<-compareCluster(geneCluster=moduleGeneSets.Entrez,
                   fun="enrichGO",
                   OrgDb = SsalOrg,
                   pvalueCutoff  = 0.05,
                   qvalueCutoff  = 0.05, 
                   ont = "BP",
                   universe = all.df$V1)
dotplot(ck)

cols  <- c(brewer.pal(8,"Set1"), brewer.pal(7,"Dark2"),brewer.pal(7,"Set2"),brewer.pal(12,"Set3"),brewer.pal(7,"Accent"),brewer.pal(12,"Paired"),"gray")

g<- cnetplot(ck,node_label="category")
g + scale_fill_manual(values =cols)