Step 1. Transcriptomics

Load packages

library(GO.db)
library(GenomicFeatures)
library(AnnotationDbi)
library(DESeq2)
library(dbplyr)
library(WGCNA)
library(DT)
library(clusterProfiler)
library(dplyr)
library(AnnotationHub)
library(ggplot2)
library(tibble)
library(readr)
library(png)
library(gridExtra)
library(grid)
library(stringr)
library(RColorBrewer)
dir.create("/Users/shashankgupta/Desktop/ImprovAFish/github/ImprovaFish/Transcriptomics/intermediate_files/Gut")
setwd("/Users/shashankgupta/Desktop/ImprovAFish/github/ImprovaFish/Transcriptomics/intermediate_files/Gut")

Set seed as a precaution for reproducibility as some methods are non-deterministic.

set.seed(13118)
Run_analysis <- TRUE    # if FALSE it tries to load data instead of running the module creation step.
prefix <- "h"
save_TOM <- FALSE       # Can get rather big
pam_stage <- FALSE      # Partitioning around medoids, tries to put more genes into modules where it is not directly clear from the dendrogram cutting.
set_verbose = 1         # How much detail from the WGCNA functions? A higher number means more detail.
assoc_measure = "bicor"
plot_labeling_size = 15
theme_set(theme_bw())

Import data

host_gut_rawCounts_total1 <- read.table("/Users/shashankgupta/Desktop/ImprovAFish/host_data/host_gut_count.txt")
sample_info <- read.csv("/Users/shashankgupta/Desktop/ImprovAFish/ImprovAFish/sample_info.csv", row.names = 1)
omics_data_host <- host_gut_rawCounts_total1
host_gut_mapping_file <- sample_info

Initial formatting

Main idea of this chunk of code is make sure we have same number of samples in countData and samples in colData for DESeq2. Also the order of the samples in colData need to match the order of the samples in countData

#sample_info$New_Diet <- tolower(sample_info$New_Diet)
#Rename the sample names based on new diet
sample_info$ID_New <- paste(sample_info$Time, "_",
                            sample_info$New_Diet, "_", 
                            sample_info$Tank_number, "_", "0",
                            sample_info$Sample_Number, 
                                      sep = "")
head_change <- subset(sample_info, select=c("common" ,"ID_New"))
row.names(sample_info) <- sample_info$ID_New

omics_data_host1 <- omics_data_host[, colnames(omics_data_host) %in% sample_info$ID_New]
omics_data_host <- omics_data_host1
all(colnames(omics_data_host) == sample_info$ID_New)
## [1] FALSE
# rename colnames of omics_data_host using new ID
df1 <- subset(sample_info, select= c("common", "ID_New"))
omics_data_host %>% 
  rename_with(~deframe(df1)[.x], .cols = df1$ID_New) %>% 
  select(any_of(df1$ID_New)) -> omics_data_host_new
omics_data_host <- omics_data_host_new

all(colnames(omics_data_host) == sample_info$ID_New)
## [1] TRUE
reorder_idx <- match(sample_info$ID_New, colnames(omics_data_host))
omics_data_host <- omics_data_host[ , reorder_idx]
all(colnames(omics_data_host) == sample_info$ID_New)
## [1] TRUE

DESeq2 analysis

df <- round(omics_data_host) %>%
  # The next steps require a data frame and round() returns a matrix
  as.data.frame() %>%
  # Only keep rows that have total counts above the cutoff
  dplyr::filter(rowSums(.) >= 50)

all(colnames(df) == rownames(sample_info))
## [1] TRUE
dds <- DESeqDataSetFromMatrix(
  countData = df, # Our prepped data frame with counts
  colData = sample_info, # Data frame with annotation for our samples
  design = ~1 # Here we are not specifying a model
)
dds$group <- factor(paste0(dds$Time, dds$New_Diet))
design(dds) <- ~ group-1

dds <- DESeq(dds, parallel = T)
resultsNames(dds)
##  [1] "groupT1CTR" "groupT1MC1" "groupT1MC2" "groupT1MN3" "groupT2CTR"
##  [6] "groupT2MC1" "groupT2MC2" "groupT2MN3" "groupT3CTR" "groupT3MC1"
## [11] "groupT3MC2" "groupT3MN3"
annot_function <- read_tsv("/Users/shashankgupta/Desktop/ImprovAFish/Salmo_salar-GCA_905237065.2_gene_annotations.tsv")
annot_function <- annot_function[, c("gene_id", "v2.gene_id.NCBI","v2.gene_name.ensembl", "v2.product")]

Time point 1 (T1)

Diet 1
res <- results(dds,
               contrast = list("groupT1MC1","groupT1CTR"))
table(res$padj < 0.05)
## 
## FALSE  TRUE 
## 30430    12
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
  filter(padj <0.05)%>%
  arrange(padj)

res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )
DT::datatable(res_tbl)
# res_tbl %>% 
#   filter(log2FoldChange > 0) %>%
#   select(v2.product)
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements
Diet 2
res <- results(dds,
               contrast = list("groupT1MC2","groupT1CTR"))
table(res$padj < 0.05)
## 
## FALSE  TRUE 
## 36872     5
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
  filter(padj <0.05)%>%
  arrange(padj) 
res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )
DT::datatable(res_tbl)
# res_tbl %>% 
#   filter(log2FoldChange > 0) %>%
#   select(v2.product)
# 
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements
Diet 3
res <- results(dds,
               contrast = list("groupT1MN3","groupT1CTR"))
table(res$padj < 0.05)
## 
## FALSE  TRUE 
## 36870     7
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
  filter(padj <0.05)%>%
  arrange(padj) 
res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )
DT::datatable(res_tbl)
# res_tbl %>% 
#   filter(log2FoldChange > 0) %>%
#   select(v2.product)
# 
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements

Time point 2 (T2)

Diet 1
# T2
res <- results(dds,
               contrast = list("groupT2MC1","groupT2CTR"))
table(res$padj < 0.05)
## 
## FALSE  TRUE 
## 36872     5
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
  filter(padj <0.05)%>%
  arrange(padj) 
res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )

DT::datatable(res_tbl)
# 
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements
Diet 2
res <- results(dds,
               contrast = list("groupT2MC2","groupT2CTR"))
table(res$padj < 0.05)
## 
## FALSE  TRUE 
## 36875     2
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
  filter(padj <0.05)%>%
  arrange(padj) 
res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )

DT::datatable(res_tbl)
# 
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements
Diet 3
res <- results(dds,
               contrast = list("groupT2MN3","groupT2CTR"))
table(res$padj < 0.05)
## 
## FALSE  TRUE 
## 36876     1
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
  filter(padj <0.05)%>%
  arrange(padj) 
res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )

DT::datatable(res_tbl)
# 
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements

Time point 3 (T3)

Diet 1
res <- results(dds,
               contrast = list("groupT3MC1","groupT3CTR"))
table(res$padj < 0.05)
## 
## FALSE  TRUE 
## 36873     4
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
  filter(padj <0.05)%>%
  arrange(padj) 

res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )
DT::datatable(res_tbl)
# res_tbl %>% 
#   filter(log2FoldChange > 0) %>%
#   select(v2.product)
# 
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements
Diet 2
res <- results(dds,
               contrast = list("groupT3MC2","groupT3CTR"))
table(res$padj < 0.05)
## 
## FALSE  TRUE 
## 36135    27
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
  filter(padj <0.05)%>%
  arrange(padj) 

res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )
DT::datatable(res_tbl)
# res_tbl %>% 
#   filter(log2FoldChange > 0) %>%
#   select(v2.product)
# 
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements
Diet 3
res <- results(dds,
               contrast = list("groupT3MN3","groupT3CTR"))
table(res$padj < 0.05)
## 
## FALSE  TRUE 
## 36870     7
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
  filter(padj <0.05)%>%
  arrange(padj) 
res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )
DT::datatable(res_tbl)
# res_tbl %>% 
#   filter(log2FoldChange > 0) %>%
#   select(v2.product)
# 
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements
# loop
# Create the dataframe
df <- data.frame(Group=character(), a1=character(), a2=character(), a3=character())

# Loop through each group and store the results in the dataframe
for(treatment in c("T1", "T2", "T3")){
  for(group in c("MC1","MC2","MN3")){
    sample_name <- paste0("group",treatment,group)
    CTR_sample <- paste0("group",treatment,"CTR")
    
    # Calculate results
    res <- results(dds, contrast = list(sample_name,CTR_sample))
    
    # Filter the results and count the number of positive elements
    res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
      filter(padj <0.05)%>%
      arrange(padj)
    a <- length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0])
    
    # Store the results in the dataframe
    df <- rbind(df, c(sample_name,a))
  }
}
names(df) <- c("Groups", "DEGs (up-regulated)")
df
##       Groups DEGs (up-regulated)
## 1 groupT1MC1                   4
## 2 groupT1MC2                   2
## 3 groupT1MN3                   3
## 4 groupT2MC1                   1
## 5 groupT2MC2                   1
## 6 groupT2MN3                   0
## 7 groupT3MC1                   3
## 8 groupT3MC2                  24
## 9 groupT3MN3                   5

Step 2. Host-microbiome interactions

WGCNA analysis

omics_data_host <- t(assay(vst(dds)))
dim(omics_data_host) %>% paste(c("Samples", "Genes"))
## [1] "142 Samples" "36878 Genes"
powers <- c(c(1:10), seq(from = 12, to=20, by=2))
sft <- pickSoftThreshold(omics_data_host, 
                         powerVector = powers, 
                         verbose = set_verbose, 
                         networkType = "signed",
                         corFn= "bicor")
idx <- min(which((-sign(sft$fitIndices[,3])*sft$fitIndices[,2]) > 0.90))
if(is.infinite(idx)){
  idx <- min(which((-sign(sft$fitIndices[,3])*sft$fitIndices[,2]) > 0.80))
  if(!is.infinite(idx)){
    st <- sft$fitIndices[idx,1]
  } else{
    idx <- which.max(-sign(sft$fitIndices[,3])*sft$fitIndices[,2])
    st <- sft$fitIndices[idx,1]
  }
} else{
  st <- sft$fitIndices[idx,1]
}

data.frame(Indices = sft$fitIndices[,1],
           sfApprox = -sign(sft$fitIndices[,3])*sft$fitIndices[,2]) %>% 
  ggplot() + 
  geom_hline(yintercept = 0.9, color = "red", alpha = 0.6) + # corresponds to R^2 cut-off of 0.9
  geom_hline(yintercept = 0.8, color = "red", alpha = 0.2) + # corresponds to R^2 cut-off of 0.8
  geom_line(aes(x = Indices, y = sfApprox), color = "red", alpha = 0.1, size = 2.5) +
  geom_text(mapping = aes(x = Indices, y = sfApprox, label = Indices), color = "red", size = 4) +
  ggtitle("Scale independence") +
  xlab("Soft Threshold (power)") +
  ylab("SF Model Fit,signed R^2") +
  xlim(1,20) +
  ylim(-1,1) + theme_bw() +
  geom_segment(aes(x = st, y = 0.25, xend = st, yend = sfApprox[idx]-0.05), 
               arrow = arrow(length = unit(0.2,"cm")), 
               size = 0.5)-> scale_independence_plot 



data.frame(Indices = sft$fitIndices[,1],
           meanApprox = sft$fitIndices[,5]) %>% 
  ggplot() + 
  geom_line(aes(x = Indices, y = meanApprox), color = "red", alpha = 0.1, size = 2.5) +
  geom_text(mapping = aes(x = Indices, y = meanApprox, label = Indices), color = "red", size = 4) +
  xlab("Soft Threshold (power)") +
  ylab("Mean Connectivity") +theme_bw() +
  geom_segment(aes(x = st-0.4, 
                   y = sft$fitIndices$mean.k.[idx], 
                   xend = 0, 
                   yend = sft$fitIndices$mean.k.[idx]),
               arrow = arrow(length = unit(0.2,"cm")), 
               size = 0.4) +
  ggtitle(paste0("Mean connectivity: ", 
                 round(sft$fitIndices$mean.k.[idx],2))) -> mean_connectivity_plot


cowplot::plot_grid(scale_independence_plot, mean_connectivity_plot, ncol = 2, align = "h", labels = c("A", "B"), label_size = plot_labeling_size) -> si_mc_plot
si_mc_plot

modules.omics_X <- blockwiseModules(omics_data_host, 
                                  power = st, 
                                  networkType = "signed", 
                                  TOMType = "signed",
                                  corType = assoc_measure,
                                  #maxPOutliers = 0.05,
                                  #deepSplit = 4, # Default 2
                                  minModuleSize = 20,           # 30
                                  #minCoreKME = 0.5,            # Default 0.5
                                  #minCoreKMESize = 2,          # Default minModuleSize/3,
                                  #minKMEtoStay = 0.5,          # Default 0.3
                                  #reassignThreshold = 0,       # Default 1e-6
                                  #mergeCutHeight = 0.2,        # Default 0.15
                                  #pamStage = pam_stage, 
                                  #pamRespectsDendro = TRUE,
                                  #replaceMissingAdjacencies = TRUE,
                                  nThreads = 10,
                                  numericLabels = TRUE,
                                  saveTOMs = FALSE,
                                  saveTOMFileBase = "HOST_TOM",
                                  verbose = 3,
                                  maxBlockSize=8000)
hubs <- chooseTopHubInEachModule(omics_data_host, modules.omics_X$colors, power = st, omitColors = "0")

stage2results_X <- list(modules = modules.omics_X, 
                      hubs = hubs)

saveRDS(stage2results_X, "/Users/shashankgupta/Desktop/ImprovAFish/github/ImprovaFish/OmicsIntegration/stage2results_Transcriptomics.rds")
# Convert labels to colors for plotting
merged_colors <- labels2colors(stage2results_X$modules$colors)
n_modules <- unique(merged_colors) %>% length()

samples_good <- sum(stage2results_X$modules$goodSamples) == length(stage2results_X$modules$goodSamples)
genes_good <- sum(stage2results_X$modules$goodGenes) == length(stage2results_X$modules$goodGenes)

ME_good <- sum(stage2results_X$modules$MEsOK) == length(stage2results_X$modules$MEsOK)
table(stage2results_X$modules$colors) %>% 
  as.data.frame() %>% 
  dplyr::rename(Module = Var1, Size = Freq) %>%
  dplyr::mutate(Module_color = labels2colors(as.numeric(as.character(Module)))) -> module_size

module_size %>% 
  ggplot(aes(x = Module, y = Size, fill = Module)) +
  geom_col(color =  "#000000") +
  ggtitle("Number of genes in each module") +
  theme(legend.position = "none") + 
  scale_fill_manual(values = setNames(module_size$Module_color,module_size$Module)) +
  geom_text(aes(label = Size),vjust = 0.5, hjust = -0.18, size = 3.5) +
  ylim(0, max(module_size$Size)*1.1) + theme_bw() +
  theme(plot.margin = margin(2, 2, 2, 2, "pt")) +
  coord_flip()-> module_size_barplot

module_size_barplot

MEs <- stage2results_X$modules$MEs
# Module correlation to other modules
MEs_R <- bicor(MEs, MEs, maxPOutliers = 0.05)

idx.r <- which(rownames(MEs_R) == "ME0")
idx.c <- which(colnames(MEs_R) == "ME0")
MEs_R_noME0 <- MEs_R[-idx.r, -idx.c]

MEs_R_noME0[upper.tri(MEs_R_noME0)] %>% 
  as.data.frame() %>% 
  dplyr::rename("correlation" = ".") %>% 
  ggplot(aes(correlation)) + 
  geom_histogram(bins = 20) +
  #geom_density() + 
  xlim(-1, 1) +
  ggtitle(paste0(prefix,"ME correlation\n w/o ",prefix ,"ME0")) -> MEs_R_density

pheatmap::pheatmap(MEs_R_noME0, color = colorRampPalette(c("Blue", "White", "Red"))(100),
                   silent = T, 
                   breaks = seq(-1,1,length.out = 101),
                   treeheight_row = 10, 
                   treeheight_col = 10,
                   main = paste0(prefix,"ME correlation heatmap w/o ", prefix ,"ME0"),
                   labels_row = rep("", length(rownames(MEs_R))), #paste0(prefix, rownames(MEs_R)),
                   labels_col = rep("", length(colnames(MEs_R))) #paste0(prefix, colnames(MEs_R))
) -> MEs_R_Corr

cowplot::plot_grid(MEs_R_density, MEs_R_Corr$gtable, labels = c("D", "E"), label_size = plot_labeling_size, rel_widths = c(0.6, 1)) -> density_eigen

all(rownames(omics_data_host) == rownames(MEs))
## [1] TRUE
dim(omics_data_host) %>% paste0(c(" samples", " genes"))
## [1] "142 samples" "36878 genes"
kME <- bicor(omics_data_host, MEs, maxPOutliers = 0.05)
dim(kME) %>% paste0(c(" genes", " modules"))
## [1] "36878 genes" "25 modules"
# Corr within modules
corr_within_module <- function(omics_data_host, modules, module_x = 1){
  idx.omics_data_host <- which(modules$colors == module_x)
  idx.me <- which(colnames(modules$MEs) == paste0("ME",module_x))
  kME_x <- bicor(omics_data_host[,idx.omics_data_host], modules$MEs[,idx.me], maxPOutliers = 0.05)
  kME_x
}

ggplot.list <- list()

modules <- 
  colnames(stage2results_X$modules$MEs)[colnames(stage2results_X$modules$MEs) %>% sub("ME", "", .) %>% as.numeric() %>% order()]

for(m in colnames(stage2results_X$modules$MEs)){
  h <- as.numeric(sub("ME","", m))
  data.frame(x = suppressWarnings(corr_within_module(omics_data_host = omics_data_host, modules = stage2results_X$modules, module_x = h))) %>% 
    ggplot() + 
    #geom_density(aes(x = x), fill = labels2colors(h), color = "black", alpha = 0.5) +
    geom_histogram(aes(x), fill = labels2colors(h), color = "black", alpha = 0.5, bins = 20) + 
    xlim(-1, 1) +
    xlab("Gene correlation") +
    ggtitle(paste0(prefix,m)) -> da_plot
  
  ggplot.list[[m]] <- da_plot
}
ggplot.list <- ggplot.list[ggplot.list %>% names() %>% sub("ME", "", .) %>% as.numeric() %>% order()]
cowplot::plot_grid(plotlist = ggplot.list) -> density_all_plot

# comine to one plot
cowplot::plot_grid(si_mc_plot , density_eigen, ncol = 1, rel_heights = c(0.8,1)) -> part_1
cowplot::plot_grid(part_1, module_size_barplot, labels = c("", "C"), label_size = plot_labeling_size, rel_widths = c(1,0.5)) -> part_2
cowplot::plot_grid(part_2, density_all_plot, ncol = 1, rel_heights = c(0.8,1), labels = c("", "F"), label_size = plot_labeling_size) -> part_3
part_3

This code takes the results of a omics data and extracts the “hubs”, or most highly connected genes in the network, for each module.

stage2results_X$hubs %>% 
  as.data.frame() %>% 
  dplyr::rename("gene_id" = ".") %>% 
  tibble::rownames_to_column(var = "Module") -> top_genes

DT::datatable(top_genes)

Statistical analysis

Linear regression analysis on the module eigengenes (principal components) from omics data, using the limma package. The sample information is loaded and the factors ‘Time’, ‘New_Diet’, and ‘Day’ are created based on the sample IDs. The design matrix for the regression is created using the ‘Day’ factor, and a linear model fit is performed on the transposed module eigengenes. The regression results are then run through an empirical Bayesian analysis. The topTable function is used to extract the regression statistics and create a data frame with the module names as the row names.

#modules.omics_X.backup <- modules.omics_X 
module_eigengenes <- modules.omics_X$MEs
all.equal(sample_info$ID_New, rownames(module_eigengenes))
## [1] TRUE
sample_info %>% 
  mutate(Time = factor(Time)) %>% 
  mutate(New_Diet = factor(New_Diet)) %>%
  mutate(Day = ifelse(Time == "T1", paste0("T1_", New_Diet),
                      ifelse(Time == "T2", paste0("T2_", New_Diet), paste0("T3_", New_Diet)))) -> meta
DT::datatable(meta)
des_mat <- model.matrix(~ meta$Day)
fit <- limma::lmFit(t(module_eigengenes), design = des_mat)
fit <- limma::eBayes(fit)

stats_df <- limma::topTable(fit, number = ncol(module_eigengenes)) %>%
  tibble::rownames_to_column("module")

DT::datatable(stats_df)

Let’s make plot of module 21

module_df <- module_eigengenes %>%
  tibble::rownames_to_column("ID_New") %>%
  dplyr::inner_join(meta %>%
                      dplyr::select(ID_New, Day, Time, New_Diet),
                    by = c("ID_New" = "ID_New"))


ggplot(module_df, aes(x = Time, y = ME21, color = Day)) +
  geom_boxplot(width = 0.2, outlier.shape = NA) +
  ggforce::geom_sina(maxwidth = 0.3) +
  theme_classic() + facet_wrap("New_Diet")

ggplot(module_df, aes(x = Day, y = ME21, color = Time)) +
  geom_boxplot(width = 0.2, outlier.shape = NA) +
  ggforce::geom_sina(maxwidth = 0.3) +
  theme_classic()

What genes are a part of module 21?

gene_module_key <- tibble::enframe(modules.omics_X$colors, name = "gene", value = "module") %>%
  # Let's add the `ME` part so its more clear what these numbers are and it matches elsewhere
  dplyr::mutate(module = paste0("ME", module))

# gene_module_key %>%
#   dplyr::filter(module == "ME21") 
DT::datatable(gene_module_key)

Functional analysis

The code in R connects to the AnnotationHub package and selects OrgDb annotations for the species “Salmo salar”. It creates two objects, SsalOrg, by subsetting and extracting the desired annotations from the ah object.

# get OrgDb from AnnotationHub
ah <- AnnotationHub()
SsalOrg <- subset(ah, ah$species == "Salmo salar" & ah$rdataclass=="OrgDb")
SsalOrg <- ah[["AH107424"]]

This script performs gene enrichment analysis on multiple modules of genes. The modules of genes are generated from omics data, and the gene annotations are read from a csv file. The probes from the omics data are matched with the gene IDs from the annotations, and the LocusLink IDs are obtained. The unique modules are then selected, and their LocusLink IDs are written into separate text files. Finally, gene ontology (GO) and KEGG pathway analysis are performed for each module using the enrichGO and enrichKEGG functions, respectively. The results are then visualized using the dotplot function.

moduleLabels <- modules.omics_X$colors
moduleColors <- labels2colors(modules.omics_X$colors)
table(moduleColors)
## moduleColors
##         black          blue         brown          cyan     darkgreen 
##          1311          2480          2192           692           174 
##      darkgrey       darkred darkturquoise         green   greenyellow 
##           118           177           130          1542           864 
##          grey        grey60     lightcyan    lightgreen   lightyellow 
##         13045           460           480           264           248 
##       magenta  midnightblue          pink        purple           red 
##           937           676          1029           897          1359 
##     royalblue        salmon           tan     turquoise        yellow 
##           225           702           846          4277          1753
# table(moduleLabels)
# table(moduleColors, moduleLabels)

MEs <- modules.omics_X$MEs
geneTree <- modules.omics_X$dendrograms[[1]]


#https://gitlab.com/garethgillard/ssalv3annotation
annot1 <- read.csv("/Users/shashankgupta/Desktop/ImprovAFish/github/Input/Salmo_salar-GCA_905237065.2_gene_annotations.csv", header = TRUE)

probes <- colnames(omics_data_host) 
probes2annot <- match(probes, annot1$gene_id)
allLLIDs <- annot1$v2.gene_id.NCBI[probes2annot]

intModules <- unique(moduleColors)
for (module in intModules) 
{
  # Select module probes 
  modGenes = (moduleColors==module) 
  # Get their entrez ID codes 
  modLLIDs = allLLIDs[modGenes]
  # Write them into a file 
  fileName = paste("LocusLinkIDs-", module, ".txt", sep=""); 
  write.table(as.character(modLLIDs), file = fileName, row.names = FALSE, col.names = FALSE) 
}
fileName = paste("LocusLinkIDs-all.txt", sep="");
write.table(as.character(allLLIDs), file = fileName, row.names = FALSE, col.names = FALSE)

GO over-representation analysis

setwd("/Users/shashankgupta/Desktop/ImprovAFish/github/ImprovaFish/Transcriptomics/intermediate_files/Gut/")
all.df <- read.table("LocusLinkIDs-all.txt")
all.df$V1 <- as.character(all.df$V1)
# Get unique module colors
list_names <- unique(moduleColors)

for (i in list_names) {
  # Read LocusLinkIDs file
  assign(paste0(i,"_df"), read.table(paste0("LocusLinkIDs-",i,".txt")))
  # Enrich GO terms
  assign(paste0("resGO_",i), enrichGO(gene = get(paste0(i,"_df"))$V1, ont = "BP", universe = all.df$V1, OrgDb = SsalOrg, pvalueCutoff  = 0.05,qvalueCutoff  = 0.05))
  # Plot GO terms using dotplot
  assign(paste0("resGO_dotplot_",i), dotplot(get(paste0("resGO_",i))))
}

Save all the plots

resGO_dot_plots <- ls(pattern="resGO_dotplot_*")
for (i in 1:length(resGO_dot_plots)) {
  tryCatch({
    png(paste0(resGO_dot_plots[i], ".png"))
    print(get(resGO_dot_plots[i]))
    dev.off()
  }, error = function(e) {
    #print(paste("Error printing", resGO_dot_plots[i], ":", e))
    file.remove(paste0(resGO_dot_plots[i], ".png"))
  })
}
png_files <- list.files(pattern = "^resGO_dotplot_.*\\.png$")
raster_list <- lapply(png_files, readPNG)
plot <- grid.arrange(grobs = lapply(raster_list, rasterGrob), ncol = 3)
<div style="text-align:center; color:red; font-size: 14pt;">Gene Ontology (GO) analysis</div>

Gene Ontology (GO) analysis

dev.off()
## quartz_off_screen 
##                 3
moduleColors <- modules.omics_X$colors
mergedMEs <- modules.omics_X$MEs
colorOrder = c("grey", standardColors(50));
moduleLabels = match(moduleColors, colorOrder)-1;
MEs <- mergedMEs
MEs <- MEs[ , -which(names(MEs) %in% c("ME0"))]
modNames <- substring(names(MEs), 3)

# Correlating each genes expression profile with the module eigengenes in order to create module gene sets
geneModuleMembership <- as.data.frame(cor(omics_data_host, MEs, use = "p"))

# Iteratively creating a list of module genesets to test. These are in ensembl ids
moduleGeneSets<-lapply(modNames,function(module){
  column = match(module, modNames)
  moduleGenes = moduleColors==module
  rownames(geneModuleMembership[moduleGenes,])
})
names(moduleGeneSets)<-modNames

# Trimming the module gene sets so that the final two digits after the "." are removed
moduleGeneSets.trimmed<-lapply(moduleGeneSets,function(x){
  str_split_fixed(x,"\\.",2)[,1]
})

moduleGeneSets.Entrez <- lapply(moduleGeneSets.trimmed, function(x) {
  x <- match(x, annot1$gene_id)
  x <- annot1$v2.gene_id.NCBI[x]
  return(x)
})

ck<-compareCluster(geneCluster=moduleGeneSets.Entrez,
                   fun="enrichGO",
                   OrgDb = SsalOrg,
                   pvalueCutoff  = 0.05,
                   qvalueCutoff  = 0.05, 
                   ont = "BP",
                   universe = all.df$V1)
dotplot(ck)

cols  <- c(brewer.pal(8,"Set1"), brewer.pal(7,"Dark2"),brewer.pal(7,"Set2"),brewer.pal(12,"Set3"),brewer.pal(7,"Accent"),brewer.pal(12,"Paired"),"gray")

g<- cnetplot(ck,node_label="category")
g + scale_fill_manual(values =cols)