library(GO.db)
library(GenomicFeatures)
library(AnnotationDbi)
library(DESeq2)
library(dbplyr)
library(WGCNA)
library(DT)
library(clusterProfiler)
library(dplyr)
library(AnnotationHub)
library(ggplot2)
library(tibble)
library(readr)
library(png)
library(gridExtra)
library(grid)
library(stringr)
library(RColorBrewer)
library(plyr)
library(limma)
Set seed as a precaution for reproducibility as some methods are non-deterministic.
set.seed(13118)
Run_analysis <- TRUE # if FALSE it tries to load data instead of running the module creation step.
prefix <- "h"
save_TOM <- FALSE # Can get rather big
pam_stage <- FALSE # Partitioning around medoids, tries to put more genes into modules where it is not directly clear from the dendrogram cutting.
set_verbose = 1 # How much detail from the WGCNA functions? A higher number means more detail.
assoc_measure = "bicor"
plot_labeling_size = 15
theme_set(theme_bw())
Loop to read in the raw count files and store them in a list
rawCounts_list <- list()
for (i in 1:4) {
file_path <- paste0("~/Desktop/salmon_project/Gene_Counts/featurecounts_X204SC21043349-Z02-F001_", i, "_genome")
rawCounts_list[[i]] <- read.delim(file_path)
}
rawCounts_total <- join_all(rawCounts_list, by = 'Geneid', type = 'full')
Loop to simplify the code for removing unnecessary information from column names:
patterns_to_remove <- c(
"X.home.projects.ku.cbd.data.HoloFish.IMPROVAFISH.ANALYSES_HostRNA.5_star_2pass.X204SC21043349.Z02.F001_1.",
"X.home.projects.ku.cbd.data.HoloFish.IMPROVAFISH.ANALYSES_HostRNA.5_star_2pass.X204SC21043349.Z02.F001_2.",
"X.home.projects.ku.cbd.data.HoloFish.IMPROVAFISH.ANALYSES_HostRNA.5_star_2pass.X204SC21043349.Z02.F001_3.",
"X.home.projects.ku.cbd.data.HoloFish.IMPROVAFISH.ANALYSES_HostRNA.5_star_2pass.X204SC21043349.Z02.F001_4.",
"_Aligned.sortedByCoord.out.bam",
"\\..*"
)
for (pattern in patterns_to_remove) {
colnames(rawCounts_total) <- gsub(pattern, "", colnames(rawCounts_total))
}
Loop to remove columns containing specific strings:
columns_to_remove <- c("Chr", "Start", "End", "BLK", "CL1", "CP1", "CP2", "CP3")
column_indices_to_remove <- which(names(rawCounts_total) %in% columns_to_remove)
rawCounts_total <- rawCounts_total[, -column_indices_to_remove]
rawCounts_total <- data.frame(rawCounts_total[, -1], row.names = rawCounts_total[, 1])
Finally, for renaming row and column names. Also main idea of this
chunk of code is make sure we have same number of samples in
countData and samples in colData for
DESeq2. Also the order of the samples in
colData need to match the order of the samples in
countData
mapping_file <- read.table("~/Desktop/salmon_project/SampleData.txt", header = T, sep = "\t", row.names = 1)
mapping_file <- subset(mapping_file, !Sample_Type %in% c("BLK" ,"CL1", "CP1", "CP2", "CP3", "Gills", "Hind_Gut"))
mapping_file$New_Diet <- toupper(mapping_file$New_Diet)
mapping_file$common <- rownames(mapping_file)
mapping_file$common <- gsub("_Lh0", "_", mapping_file$common)
rownames(mapping_file) <- mapping_file$common
colnames(rawCounts_total) <- gsub("_Lh0", "_", colnames(rawCounts_total))
rows_to_remove <- c("T0")
mapping_file <- mapping_file[!mapping_file$Time %in% rows_to_remove,]
rawCounts_total <- rawCounts_total[, !grepl("T0_ctr_", names(rawCounts_total))]
reorder_idx <- match(rownames(mapping_file), colnames(rawCounts_total))
rawCounts_total <- rawCounts_total[, reorder_idx]
all(colnames(rawCounts_total) == rownames(mapping_file))
## [1] TRUE
omics_data_host_Liver <- rawCounts_total
sample_info <- mapping_file
sample_info$common <- rownames(sample_info)
#sample_info$New_Diet <- tolower(sample_info$New_Diet)
#Rename the sample names based on new diet
sample_info$ID_New <- paste(sample_info$Time, "_",
sample_info$New_Diet, "_",
sample_info$Tank_number, "_", "0",
sample_info$Sample_Number,
sep = "")
head_change <- subset(sample_info, select=c("common" ,"ID_New"))
row.names(sample_info) <- sample_info$ID_New
omics_data_host_Liver1 <- omics_data_host_Liver[, colnames(omics_data_host_Liver) %in% sample_info$common]
omics_data_host_Liver <- omics_data_host_Liver1
all(colnames(omics_data_host_Liver) == sample_info$common)
## [1] TRUE
# rename colnames of omics_data_host_Liver using new ID
df1 <- subset(sample_info, select= c("common", "ID_New"))
new_colnames <- df1$ID_New[match(colnames(omics_data_host_Liver), df1$common)]
colnames(omics_data_host_Liver) <- ifelse(is.na(new_colnames), colnames(omics_data_host_Liver), new_colnames)
all(colnames(omics_data_host_Liver) == sample_info$ID_New)
## [1] TRUE
df <- round(omics_data_host_Liver) %>%
# The next steps require a data frame and round() returns a matrix
as.data.frame() %>%
# Only keep rows that have total counts above the cutoff
dplyr::filter(rowSums(.) >= 50)
all(colnames(df) == rownames(sample_info))
## [1] TRUE
dds_liver <- DESeqDataSetFromMatrix(
countData = df, # Our prepped data frame with counts
colData = sample_info, # Data frame with annotation for our samples
design = ~1 # Here we are not specifying a model
)
dds_liver$group <- factor(paste0(dds_liver$Time, dds_liver$New_Diet))
design(dds_liver) <- ~ group-1
dds_liver <- DESeq(dds_liver, parallel = T)
resultsNames(dds_liver)
## [1] "groupT1CTR" "groupT1MC1" "groupT1MC2" "groupT1MN3" "groupT2CTR"
## [6] "groupT2MC1" "groupT2MC2" "groupT2MN3" "groupT3CTR" "groupT3MC1"
## [11] "groupT3MC2" "groupT3MN3"
annot_function <- read_tsv("/Users/shashankgupta/Desktop/ImprovAFish/Salmo_salar-GCA_905237065.2_gene_annotations.tsv")
annot_function <- annot_function[, c("gene_id", "v2.gene_id.NCBI","v2.gene_name.ensembl", "v2.product")]
res <- results(dds_liver,
contrast = list("groupT1MC1","groupT1CTR"))
table(res$padj < 0.05)
##
## FALSE TRUE
## 30197 71
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
filter(padj <0.05)%>%
arrange(padj)
res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )
DT::datatable(res_tbl)
# res_tbl %>%
# filter(log2FoldChange > 0) %>%
# select(v2.product)
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements
res <- results(dds_liver,
contrast = list("groupT1MC2","groupT1CTR"))
table(res$padj < 0.05)
##
## FALSE TRUE
## 27600 12
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
filter(padj <0.05)%>%
arrange(padj)
res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )
DT::datatable(res_tbl)
# res_tbl %>%
# filter(log2FoldChange > 0) %>%
# select(v2.product)
#
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements
res <- results(dds_liver,
contrast = list("groupT1MN3","groupT1CTR"))
table(res$padj < 0.05)
##
## FALSE TRUE
## 34250 3
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
filter(padj <0.05)%>%
arrange(padj)
res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )
DT::datatable(res_tbl)
# res_tbl %>%
# filter(log2FoldChange > 0) %>%
# select(v2.product)
#
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements
# T2
res <- results(dds_liver,
contrast = list("groupT2MC1","groupT2CTR"))
table(res$padj < 0.05)
##
## FALSE TRUE
## 34251 2
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
filter(padj <0.05)%>%
arrange(padj)
res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )
DT::datatable(res_tbl)
#
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements
res <- results(dds_liver,
contrast = list("groupT2MC2","groupT2CTR"))
table(res$padj < 0.05)
##
## FALSE TRUE
## 18974 5
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
filter(padj <0.05)%>%
arrange(padj)
res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )
DT::datatable(res_tbl)
#
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements
res <- results(dds_liver,
contrast = list("groupT2MN3","groupT2CTR"))
table(res$padj < 0.05)
##
## FALSE TRUE
## 34251 2
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
filter(padj <0.05)%>%
arrange(padj)
res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )
DT::datatable(res_tbl)
#
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements
res <- results(dds_liver,
contrast = list("groupT3MC1","groupT3CTR"))
table(res$padj < 0.05)
##
## FALSE TRUE
## 29413 191
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
filter(padj <0.05)%>%
arrange(padj)
res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )
DT::datatable(res_tbl)
# res_tbl %>%
# filter(log2FoldChange > 0) %>%
# select(v2.product)
#
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements
res <- results(dds_liver,
contrast = list("groupT3MC2","groupT3CTR"))
table(res$padj < 0.05)
##
## FALSE TRUE
## 27983 293
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
filter(padj <0.05)%>%
arrange(padj)
res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )
DT::datatable(res_tbl)
# res_tbl %>%
# filter(log2FoldChange > 0) %>%
# select(v2.product)
#
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements
res <- results(dds_liver,
contrast = list("groupT3MN3","groupT3CTR"))
table(res$padj < 0.05)
##
## FALSE TRUE
## 34245 8
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
filter(padj <0.05)%>%
arrange(padj)
res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )
DT::datatable(res_tbl)
# res_tbl %>%
# filter(log2FoldChange > 0) %>%
# select(v2.product)
#
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements
# Create the dataframe with the required columns
df <- data.frame(Group = character(),
"DEGs (up-regulated)" = integer(),
"DEGs (down-regulated)" = integer(),
"DEGs (up)-v2.products" = character(),
"DEGs (down)-v2.products" = character())
# Loop through each group and store the results in the dataframe
for (treatment in c("T1", "T2", "T3")) {
for (group in c("MC1", "MC2", "MN3")) {
sample_name <- paste0("group", treatment, group)
CTR_sample <- paste0("group", treatment, "CTR")
# Calculate results
res <- results(dds_liver, contrast = list(sample_name, CTR_sample))
# Filter the results and count the number of positive and negative elements
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
filter(padj < 0.05) %>%
arrange(padj)
res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )
# Separate the up-regulated and down-regulated genes
up_genes <- res_tbl$log2FoldChange > 0
down_genes <- res_tbl$log2FoldChange < 0
up_regulated <- sum(up_genes)
down_regulated <- sum(down_genes)
# Store the results in the dataframe
df <- rbind(df, c(sample_name, up_regulated, down_regulated,
paste0(res_tbl$v2.product[up_genes], collapse = ", "),
paste0(res_tbl$v2.product[down_genes], collapse = ", ")))
}
}
# Rename the columns
names(df) <- c("Groups",
"DEGs (up-regulated)",
"DEGs (down-regulated)",
"DEGs (up)-v2.products",
"DEGs (down)-v2.products")
DT::datatable(df)
omics_data_host <- t(assay(vst(dds_liver)))
dim(omics_data_host) %>% paste(c("Samples", "Genes"))
## [1] "144 Samples" "34256 Genes"
powers <- c(c(1:10), seq(from = 12, to=20, by=2))
sft <- pickSoftThreshold(omics_data_host,
powerVector = powers,
verbose = set_verbose,
networkType = "signed",
corFn= "bicor")
idx <- min(which((-sign(sft$fitIndices[,3])*sft$fitIndices[,2]) > 0.90))
if(is.infinite(idx)){
idx <- min(which((-sign(sft$fitIndices[,3])*sft$fitIndices[,2]) > 0.80))
if(!is.infinite(idx)){
st <- sft$fitIndices[idx,1]
} else{
idx <- which.max(-sign(sft$fitIndices[,3])*sft$fitIndices[,2])
st <- sft$fitIndices[idx,1]
}
} else{
st <- sft$fitIndices[idx,1]
}
data.frame(Indices = sft$fitIndices[,1],
sfApprox = -sign(sft$fitIndices[,3])*sft$fitIndices[,2]) %>%
ggplot() +
geom_hline(yintercept = 0.9, color = "red", alpha = 0.6) + # corresponds to R^2 cut-off of 0.9
geom_hline(yintercept = 0.8, color = "red", alpha = 0.2) + # corresponds to R^2 cut-off of 0.8
geom_line(aes(x = Indices, y = sfApprox), color = "red", alpha = 0.1, size = 2.5) +
geom_text(mapping = aes(x = Indices, y = sfApprox, label = Indices), color = "red", size = 4) +
ggtitle("Scale independence") +
xlab("Soft Threshold (power)") +
ylab("SF Model Fit,signed R^2") +
xlim(1,20) +
ylim(-1,1) + theme_bw() +
geom_segment(aes(x = st, y = 0.25, xend = st, yend = sfApprox[idx]-0.05),
arrow = arrow(length = unit(0.2,"cm")),
size = 0.5)-> scale_independence_plot
data.frame(Indices = sft$fitIndices[,1],
meanApprox = sft$fitIndices[,5]) %>%
ggplot() +
geom_line(aes(x = Indices, y = meanApprox), color = "red", alpha = 0.1, size = 2.5) +
geom_text(mapping = aes(x = Indices, y = meanApprox, label = Indices), color = "red", size = 4) +
xlab("Soft Threshold (power)") +
ylab("Mean Connectivity") +theme_bw() +
geom_segment(aes(x = st-0.4,
y = sft$fitIndices$mean.k.[idx],
xend = 0,
yend = sft$fitIndices$mean.k.[idx]),
arrow = arrow(length = unit(0.2,"cm")),
size = 0.4) +
ggtitle(paste0("Mean connectivity: ",
round(sft$fitIndices$mean.k.[idx],2))) -> mean_connectivity_plot
cowplot::plot_grid(scale_independence_plot, mean_connectivity_plot, ncol = 2, align = "h", labels = c("A", "B"), label_size = plot_labeling_size) -> si_mc_plot
si_mc_plot
modules.omics_X_Liver <- blockwiseModules(omics_data_host,
power = st,
networkType = "signed",
TOMType = "signed",
corType = assoc_measure,
#maxPOutliers = 0.05,
#deepSplit = 4, # Default 2
minModuleSize = 20, # 30
#minCoreKME = 0.5, # Default 0.5
#minCoreKMESize = 2, # Default minModuleSize/3,
#minKMEtoStay = 0.5, # Default 0.3
#reassignThreshold = 0, # Default 1e-6
#mergeCutHeight = 0.2, # Default 0.15
#pamStage = pam_stage,
#pamRespectsDendro = TRUE,
#replaceMissingAdjacencies = TRUE,
nThreads = 10,
numericLabels = TRUE,
saveTOMs = FALSE,
saveTOMFileBase = "HOST_TOM",
verbose = 1,
maxBlockSize=8000)
hubs <- chooseTopHubInEachModule(omics_data_host, modules.omics_X_Liver$colors, power = st, omitColors = "0")
stage2results_X_Liver <- list(modules = modules.omics_X_Liver,
hubs = hubs)
saveRDS(stage2results_X_Liver, "/Users/shashankgupta/Desktop/ImprovAFish/github/ImprovaFish/OmicsIntegration/stage2results_Transcriptomics_Liver.rds")
# Convert labels to colors for plotting
merged_colors <- labels2colors(stage2results_X_Liver$modules$colors)
n_modules <- unique(merged_colors) %>% length()
samples_good <- sum(stage2results_X_Liver$modules$goodSamples) == length(stage2results_X_Liver$modules$goodSamples)
genes_good <- sum(stage2results_X_Liver$modules$goodGenes) == length(stage2results_X_Liver$modules$goodGenes)
ME_good <- sum(stage2results_X_Liver$modules$MEsOK) == length(stage2results_X_Liver$modules$MEsOK)
table(stage2results_X_Liver$modules$colors) %>%
as.data.frame() %>%
dplyr::rename(Module = Var1, Size = Freq) %>%
dplyr::mutate(Module_color = labels2colors(as.numeric(as.character(Module)))) -> module_size
module_size %>%
ggplot(aes(x = Module, y = Size, fill = Module)) +
geom_col(color = "#000000") +
ggtitle("Number of genes in each module") +
theme(legend.position = "none") +
scale_fill_manual(values = setNames(module_size$Module_color,module_size$Module)) +
geom_text(aes(label = Size),vjust = 0.5, hjust = -0.18, size = 3.5) +
ylim(0, max(module_size$Size)*1.1) + theme_bw() +
theme(plot.margin = margin(2, 2, 2, 2, "pt")) +
coord_flip() -> module_size_barplot
module_size_barplot + theme(legend.position = "none") -> module_size_barplot
module_size_barplot
MEs <- stage2results_X_Liver$modules$MEs
# Module correlation to other modules
MEs_R <- bicor(MEs, MEs, maxPOutliers = 0.05)
idx.r <- which(rownames(MEs_R) == "ME0")
idx.c <- which(colnames(MEs_R) == "ME0")
MEs_R_noME0 <- MEs_R[-idx.r, -idx.c]
MEs_R_noME0[upper.tri(MEs_R_noME0)] %>%
as.data.frame() %>%
dplyr::rename("correlation" = ".") %>%
ggplot(aes(correlation)) +
geom_histogram(bins = 20) +
#geom_density() +
xlim(-1, 1) +
ggtitle(paste0(prefix,"ME correlation\n w/o ",prefix ,"ME0")) -> MEs_R_density
pheatmap::pheatmap(MEs_R_noME0, color = colorRampPalette(c("Blue", "White", "Red"))(100),
silent = T,
breaks = seq(-1,1,length.out = 101),
treeheight_row = 10,
treeheight_col = 10,
main = paste0(prefix,"ME correlation heatmap w/o ", prefix ,"ME0"),
labels_row = rep("", length(rownames(MEs_R))), #paste0(prefix, rownames(MEs_R)),
labels_col = rep("", length(colnames(MEs_R))) #paste0(prefix, colnames(MEs_R))
) -> MEs_R_Corr
cowplot::plot_grid(MEs_R_density, MEs_R_Corr$gtable, labels = c("D", "E"), label_size = plot_labeling_size, rel_widths = c(0.6, 1)) -> density_eigen
all(rownames(omics_data_host) == rownames(MEs))
## [1] TRUE
dim(omics_data_host) %>% paste0(c(" samples", " genes"))
## [1] "144 samples" "34256 genes"
kME <- bicor(omics_data_host, MEs, maxPOutliers = 0.05)
dim(kME) %>% paste0(c(" genes", " modules"))
## [1] "34256 genes" "77 modules"
# Corr within modules
corr_within_module <- function(omics_data_host, modules, module_x = 1){
idx.omics_data_host <- which(modules$colors == module_x)
idx.me <- which(colnames(modules$MEs) == paste0("ME",module_x))
kME_x <- bicor(omics_data_host[,idx.omics_data_host], modules$MEs[,idx.me], maxPOutliers = 0.05)
kME_x
}
ggplot.list <- list()
modules <-
colnames(stage2results_X_Liver$modules$MEs)[colnames(stage2results_X_Liver$modules$MEs) %>% sub("ME", "", .) %>% as.numeric() %>% order()]
for(m in colnames(stage2results_X_Liver$modules$MEs)){
h <- as.numeric(sub("ME","", m))
data.frame(x = suppressWarnings(corr_within_module(omics_data_host = omics_data_host, modules = stage2results_X_Liver$modules, module_x = h))) %>%
ggplot() +
#geom_density(aes(x = x), fill = labels2colors(h), color = "black", alpha = 0.5) +
geom_histogram(aes(x), fill = labels2colors(h), color = "black", alpha = 0.5, bins = 20) +
xlim(-1, 1) +
xlab("Gene correlation") +
ggtitle(paste0(prefix,m)) -> da_plot
ggplot.list[[m]] <- da_plot
}
ggplot.list <- ggplot.list[ggplot.list %>% names() %>% sub("ME", "", .) %>% as.numeric() %>% order()]
cowplot::plot_grid(plotlist = ggplot.list) -> density_all_plot
# comine to one plot
cowplot::plot_grid(si_mc_plot , density_eigen, ncol = 1, rel_heights = c(0.8,1)) -> part_1
cowplot::plot_grid(part_1, module_size_barplot, labels = c("", "C"), label_size = plot_labeling_size, rel_widths = c(1,0.5)) -> part_2
cowplot::plot_grid(part_2, density_all_plot, ncol = 1, rel_heights = c(0.8,1), labels = c("", "F"), label_size = plot_labeling_size) -> part_3
part_3
This code takes the results of a omics data and extracts the “hubs”, or most highly connected genes in the network, for each module.
stage2results_X_Liver$hubs %>%
as.data.frame() %>%
dplyr::rename("gene_id" = ".") %>%
tibble::rownames_to_column(var = "Module") -> top_genes
DT::datatable(top_genes)
Linear regression analysis on the module eigengenes (principal components) from omics data, using the limma package. The sample information is loaded and the factors ‘Time’, ‘New_Diet’, and ‘Day’ are created based on the sample IDs. The design matrix for the regression is created using the ‘Day’ factor, and a linear model fit is performed on the transposed module eigengenes. The regression results are then run through an empirical Bayesian analysis. The topTable function is used to extract the regression statistics and create a data frame with the module names as the row names.
#modules.omics_X_Liver.backup <- modules.omics_X_Liver
module_eigengenes <- modules.omics_X_Liver$MEs
all.equal(sample_info$ID_New, rownames(module_eigengenes))
## [1] TRUE
sample_info %>%
mutate(Time = factor(Time)) %>%
mutate(New_Diet = factor(New_Diet)) %>%
mutate(Day = ifelse(Time == "T1", paste0("T1_", New_Diet),
ifelse(Time == "T2", paste0("T2_", New_Diet), paste0("T3_", New_Diet)))) -> meta
DT::datatable(meta)
des_mat <- model.matrix(~ meta$Day)
fit <- limma::lmFit(t(module_eigengenes), design = des_mat)
fit <- limma::eBayes(fit)
stats_df <- limma::topTable(fit, number = ncol(module_eigengenes)) %>%
tibble::rownames_to_column("module")
DT::datatable(stats_df)
Let’s make plot of module 17
module_df <- module_eigengenes %>%
tibble::rownames_to_column("ID_New") %>%
dplyr::inner_join(meta %>%
dplyr::select(ID_New, Day, Time, New_Diet),
by = c("ID_New" = "ID_New"))
ggplot(module_df, aes(x = Time, y = ME17, color = Day)) +
geom_boxplot(width = 0.2, outlier.shape = NA) +
ggforce::geom_sina(maxwidth = 0.3) +
theme_classic() + facet_wrap("New_Diet")
ggplot(module_df, aes(x = Day, y = ME17, color = Time)) +
geom_boxplot(width = 0.2, outlier.shape = NA) +
ggforce::geom_sina(maxwidth = 0.3) +
theme_classic()
What genes are a part of module 21?
gene_module_key <- tibble::enframe(modules.omics_X_Liver$colors, name = "gene", value = "module") %>%
# Let's add the `ME` part so its more clear what these numbers are and it matches elsewhere
dplyr::mutate(module = paste0("ME", module))
# gene_module_key %>%
# dplyr::filter(module == "ME21")
DT::datatable(gene_module_key)
The code in R connects to the AnnotationHub package and selects OrgDb annotations for the species “Salmo salar”. It creates two objects, SsalOrg, by subsetting and extracting the desired annotations from the ah object.
# get OrgDb from AnnotationHub
ah <- AnnotationHub()
SsalOrg <- subset(ah, ah$species == "Salmo salar" & ah$rdataclass=="OrgDb")
SsalOrg <- ah[["AH107424"]]
This script performs gene enrichment analysis on multiple modules of genes. The modules of genes are generated from omics data, and the gene annotations are read from a csv file. The probes from the omics data are matched with the gene IDs from the annotations, and the LocusLink IDs are obtained. The unique modules are then selected, and their LocusLink IDs are written into separate text files. Finally, gene ontology (GO) and KEGG pathway analysis are performed for each module using the enrichGO and enrichKEGG functions, respectively. The results are then visualized using the dotplot function.
moduleLabels <- modules.omics_X_Liver$colors
moduleColors <- labels2colors(modules.omics_X_Liver$colors)
table(moduleColors)
## moduleColors
## antiquewhite4 bisque4 black blue blue2
## 45 64 416 790 20
## brown brown2 brown4 coral1 coral2
## 649 21 70 46 40
## cyan darkgreen darkgrey darkmagenta darkolivegreen
## 306 206 192 107 111
## darkolivegreen4 darkorange darkorange2 darkred darkseagreen4
## 22 167 73 207 47
## darkslateblue darkturquoise firebrick4 floralwhite green
## 58 200 23 74 485
## greenyellow grey grey60 honeydew1 indianred4
## 373 21281 258 47 23
## ivory lavenderblush3 lightcoral lightcyan lightcyan1
## 75 48 31 260 87
## lightgreen lightpink4 lightsteelblue lightsteelblue1 lightyellow
## 246 48 31 89 230
## magenta maroon mediumorchid mediumpurple2 mediumpurple3
## 396 49 40 32 91
## midnightblue navajowhite2 orange orangered3 orangered4
## 289 49 172 36 92
## paleturquoise palevioletred3 pink plum plum1
## 134 49 409 36 95
## plum2 purple red royalblue saddlebrown
## 51 387 433 214 152
## salmon salmon4 sienna3 skyblue skyblue1
## 336 50 105 152 37
## skyblue2 skyblue3 steelblue tan thistle1
## 40 100 151 353 51
## thistle2 turquoise violet white yellow
## 51 1014 113 159 632
## yellow4 yellowgreen
## 39 101
# table(moduleLabels)
# table(moduleColors, moduleLabels)
MEs <- modules.omics_X_Liver$MEs
geneTree <- modules.omics_X_Liver$dendrograms[[1]]
#https://gitlab.com/garethgillard/ssalv3annotation
annot1 <- read.csv("/Users/shashankgupta/Desktop/ImprovAFish/github/Input/Salmo_salar-GCA_905237065.2_gene_annotations.csv", header = TRUE)
probes <- colnames(omics_data_host)
probes2annot <- match(probes, annot1$gene_id)
allLLIDs <- annot1$v2.gene_id.NCBI[probes2annot]
intModules <- unique(moduleColors)
for (module in intModules)
{
# Select module probes
modGenes = (moduleColors==module)
# Get their entrez ID codes
modLLIDs = allLLIDs[modGenes]
# Write them into a file
fileName = paste("LocusLinkIDs-", module, ".txt", sep="");
write.table(as.character(modLLIDs), file = fileName, row.names = FALSE, col.names = FALSE)
}
fileName = paste("LocusLinkIDs-all.txt", sep="");
write.table(as.character(allLLIDs), file = fileName, row.names = FALSE, col.names = FALSE)
setwd("/Users/shashankgupta/Desktop/ImprovAFish/github/ImprovaFish/Transcriptomics/intermediate_files/Liver/")
all.df <- read.table("LocusLinkIDs-all.txt")
all.df$V1 <- as.character(all.df$V1)
# Get unique module colors
list_names <- unique(moduleColors)
for (i in list_names) {
# Read LocusLinkIDs file
assign(paste0(i,"_df"), read.table(paste0("LocusLinkIDs-",i,".txt")))
# Enrich GO terms
assign(paste0("resGO_",i), enrichGO(gene = get(paste0(i,"_df"))$V1, ont = "BP", universe = all.df$V1, OrgDb = SsalOrg, pvalueCutoff = 0.05,qvalueCutoff = 0.05))
# Plot GO terms using dotplot
assign(paste0("resGO_dotplot_",i), dotplot(get(paste0("resGO_",i))))
}
Save all the plots
resGO_dot_plots <- ls(pattern="resGO_dotplot_*")
for (i in 1:length(resGO_dot_plots)) {
tryCatch({
png(paste0(resGO_dot_plots[i], ".png"))
print(get(resGO_dot_plots[i]))
dev.off()
}, error = function(e) {
#print(paste("Error printing", resGO_dot_plots[i], ":", e))
file.remove(paste0(resGO_dot_plots[i], ".png"))
})
}
png_files <- list.files(pattern = "^resGO_dotplot_.*\\.png$")
raster_list <- lapply(png_files, readPNG)
plot <- grid.arrange(grobs = lapply(raster_list, rasterGrob), ncol = 3)
dev.off()
## quartz_off_screen
## 3
moduleColors <- modules.omics_X_Liver$colors
mergedMEs <- modules.omics_X_Liver$MEs
colorOrder = c("grey", standardColors(50));
moduleLabels = match(moduleColors, colorOrder)-1;
MEs <- mergedMEs
MEs <- MEs[ , -which(names(MEs) %in% c("ME0"))]
modNames <- substring(names(MEs), 3)
# Correlating each genes expression profile with the module eigengenes in order to create module gene sets
geneModuleMembership <- as.data.frame(cor(omics_data_host, MEs, use = "p"))
# Iteratively creating a list of module genesets to test. These are in ensembl ids
moduleGeneSets<-lapply(modNames,function(module){
column = match(module, modNames)
moduleGenes = moduleColors==module
rownames(geneModuleMembership[moduleGenes,])
})
names(moduleGeneSets)<-modNames
# Trimming the module gene sets so that the final two digits after the "." are removed
moduleGeneSets.trimmed<-lapply(moduleGeneSets,function(x){
str_split_fixed(x,"\\.",2)[,1]
})
moduleGeneSets.Entrez <- lapply(moduleGeneSets.trimmed, function(x) {
x <- match(x, annot1$gene_id)
x <- annot1$v2.gene_id.NCBI[x]
return(x)
})
ck<-compareCluster(geneCluster=moduleGeneSets.Entrez,
fun="enrichGO",
OrgDb = SsalOrg,
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
ont = "BP",
universe = all.df$V1)
dotplot(ck)
cols <- c(brewer.pal(8,"Set1"), brewer.pal(7,"Dark2"),brewer.pal(7,"Set2"),brewer.pal(12,"Set3"),brewer.pal(7,"Accent"),brewer.pal(12,"Paired"),"gray")
g<- cnetplot(ck,node_label="category")
g + scale_fill_manual(values =cols)