library(GO.db)
library(GenomicFeatures)
library(AnnotationDbi)
library(DESeq2)
library(dbplyr)
library(WGCNA)
library(DT)
library(clusterProfiler)
library(dplyr)
library(AnnotationHub)
library(ggplot2)
library(tibble)
library(readr)
library(png)
library(gridExtra)
library(grid)
library(stringr)
library(RColorBrewer)
dir.create("/Users/shashankgupta/Desktop/ImprovAFish/github/ImprovaFish/Transcriptomics/intermediate_files/Gut")
setwd("/Users/shashankgupta/Desktop/ImprovAFish/github/ImprovaFish/Transcriptomics/intermediate_files/Gut")
Set seed as a precaution for reproducibility as some methods are non-deterministic.
set.seed(13118)
Run_analysis <- TRUE # if FALSE it tries to load data instead of running the module creation step.
prefix <- "h"
save_TOM <- FALSE # Can get rather big
pam_stage <- FALSE # Partitioning around medoids, tries to put more genes into modules where it is not directly clear from the dendrogram cutting.
set_verbose = 1 # How much detail from the WGCNA functions? A higher number means more detail.
assoc_measure = "bicor"
plot_labeling_size = 15
theme_set(theme_bw())
host_gut_rawCounts_total1 <- read.table("/Users/shashankgupta/Desktop/ImprovAFish/host_data/host_gut_count.txt")
sample_info <- read.csv("/Users/shashankgupta/Desktop/ImprovAFish/ImprovAFish/sample_info.csv", row.names = 1)
omics_data_host <- host_gut_rawCounts_total1
host_gut_mapping_file <- sample_info
Main idea of this chunk of code is make sure we have same number of
samples in countData and samples in colData
for DESeq2. Also the order of the samples in
colData need to match the order of the samples in
countData
#sample_info$New_Diet <- tolower(sample_info$New_Diet)
#Rename the sample names based on new diet
sample_info$ID_New <- paste(sample_info$Time, "_",
sample_info$New_Diet, "_",
sample_info$Tank_number, "_", "0",
sample_info$Sample_Number,
sep = "")
head_change <- subset(sample_info, select=c("common" ,"ID_New"))
row.names(sample_info) <- sample_info$ID_New
omics_data_host1 <- omics_data_host[, colnames(omics_data_host) %in% sample_info$ID_New]
omics_data_host <- omics_data_host1
all(colnames(omics_data_host) == sample_info$ID_New)
## [1] FALSE
# rename colnames of omics_data_host using new ID
df1 <- subset(sample_info, select= c("common", "ID_New"))
omics_data_host %>%
rename_with(~deframe(df1)[.x], .cols = df1$ID_New) %>%
select(any_of(df1$ID_New)) -> omics_data_host_new
omics_data_host <- omics_data_host_new
all(colnames(omics_data_host) == sample_info$ID_New)
## [1] TRUE
reorder_idx <- match(sample_info$ID_New, colnames(omics_data_host))
omics_data_host <- omics_data_host[ , reorder_idx]
all(colnames(omics_data_host) == sample_info$ID_New)
## [1] TRUE
df <- round(omics_data_host) %>%
# The next steps require a data frame and round() returns a matrix
as.data.frame() %>%
# Only keep rows that have total counts above the cutoff
dplyr::filter(rowSums(.) >= 50)
all(colnames(df) == rownames(sample_info))
## [1] TRUE
dds <- DESeqDataSetFromMatrix(
countData = df, # Our prepped data frame with counts
colData = sample_info, # Data frame with annotation for our samples
design = ~1 # Here we are not specifying a model
)
dds$group <- factor(paste0(dds$Time, dds$New_Diet))
design(dds) <- ~ group-1
dds <- DESeq(dds, parallel = T)
resultsNames(dds)
## [1] "groupT1CTR" "groupT1MC1" "groupT1MC2" "groupT1MN3" "groupT2CTR"
## [6] "groupT2MC1" "groupT2MC2" "groupT2MN3" "groupT3CTR" "groupT3MC1"
## [11] "groupT3MC2" "groupT3MN3"
annot_function <- read_tsv("/Users/shashankgupta/Desktop/ImprovAFish/Salmo_salar-GCA_905237065.2_gene_annotations.tsv")
annot_function <- annot_function[, c("gene_id", "v2.gene_id.NCBI","v2.gene_name.ensembl", "v2.product")]
res <- results(dds,
contrast = list("groupT1MC1","groupT1CTR"))
table(res$padj < 0.05)
##
## FALSE TRUE
## 30430 12
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
filter(padj <0.05)%>%
arrange(padj)
res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )
DT::datatable(res_tbl)
# res_tbl %>%
# filter(log2FoldChange > 0) %>%
# select(v2.product)
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements
res <- results(dds,
contrast = list("groupT1MC2","groupT1CTR"))
table(res$padj < 0.05)
##
## FALSE TRUE
## 36872 5
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
filter(padj <0.05)%>%
arrange(padj)
res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )
DT::datatable(res_tbl)
# res_tbl %>%
# filter(log2FoldChange > 0) %>%
# select(v2.product)
#
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements
res <- results(dds,
contrast = list("groupT1MN3","groupT1CTR"))
table(res$padj < 0.05)
##
## FALSE TRUE
## 36870 7
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
filter(padj <0.05)%>%
arrange(padj)
res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )
DT::datatable(res_tbl)
# res_tbl %>%
# filter(log2FoldChange > 0) %>%
# select(v2.product)
#
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements
# T2
res <- results(dds,
contrast = list("groupT2MC1","groupT2CTR"))
table(res$padj < 0.05)
##
## FALSE TRUE
## 36872 5
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
filter(padj <0.05)%>%
arrange(padj)
res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )
DT::datatable(res_tbl)
#
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements
res <- results(dds,
contrast = list("groupT2MC2","groupT2CTR"))
table(res$padj < 0.05)
##
## FALSE TRUE
## 36875 2
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
filter(padj <0.05)%>%
arrange(padj)
res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )
DT::datatable(res_tbl)
#
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements
res <- results(dds,
contrast = list("groupT2MN3","groupT2CTR"))
table(res$padj < 0.05)
##
## FALSE TRUE
## 36876 1
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
filter(padj <0.05)%>%
arrange(padj)
res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )
DT::datatable(res_tbl)
#
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements
res <- results(dds,
contrast = list("groupT3MC1","groupT3CTR"))
table(res$padj < 0.05)
##
## FALSE TRUE
## 36873 4
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
filter(padj <0.05)%>%
arrange(padj)
res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )
DT::datatable(res_tbl)
# res_tbl %>%
# filter(log2FoldChange > 0) %>%
# select(v2.product)
#
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements
res <- results(dds,
contrast = list("groupT3MC2","groupT3CTR"))
table(res$padj < 0.05)
##
## FALSE TRUE
## 36135 27
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
filter(padj <0.05)%>%
arrange(padj)
res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )
DT::datatable(res_tbl)
# res_tbl %>%
# filter(log2FoldChange > 0) %>%
# select(v2.product)
#
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements
res <- results(dds,
contrast = list("groupT3MN3","groupT3CTR"))
table(res$padj < 0.05)
##
## FALSE TRUE
## 36870 7
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
filter(padj <0.05)%>%
arrange(padj)
res_tbl <- merge(res_tbl, annot_function, by.x = "ENSEMBL", by.y = "gene_id", all.x = TRUE )
DT::datatable(res_tbl)
# res_tbl %>%
# filter(log2FoldChange > 0) %>%
# select(v2.product)
#
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0]) # count positive elements
# length(res_tbl$log2FoldChange[res_tbl$log2FoldChange<0]) # count negative elements
# loop
# Create the dataframe
df <- data.frame(Group=character(), a1=character(), a2=character(), a3=character())
# Loop through each group and store the results in the dataframe
for(treatment in c("T1", "T2", "T3")){
for(group in c("MC1","MC2","MN3")){
sample_name <- paste0("group",treatment,group)
CTR_sample <- paste0("group",treatment,"CTR")
# Calculate results
res <- results(dds, contrast = list(sample_name,CTR_sample))
# Filter the results and count the number of positive elements
res_tbl <- as_tibble(res, rownames = "ENSEMBL") %>%
filter(padj <0.05)%>%
arrange(padj)
a <- length(res_tbl$log2FoldChange[res_tbl$log2FoldChange>0])
# Store the results in the dataframe
df <- rbind(df, c(sample_name,a))
}
}
names(df) <- c("Groups", "DEGs (up-regulated)")
df
## Groups DEGs (up-regulated)
## 1 groupT1MC1 4
## 2 groupT1MC2 2
## 3 groupT1MN3 3
## 4 groupT2MC1 1
## 5 groupT2MC2 1
## 6 groupT2MN3 0
## 7 groupT3MC1 3
## 8 groupT3MC2 24
## 9 groupT3MN3 5
omics_data_host <- t(assay(vst(dds)))
dim(omics_data_host) %>% paste(c("Samples", "Genes"))
## [1] "142 Samples" "36878 Genes"
powers <- c(c(1:10), seq(from = 12, to=20, by=2))
sft <- pickSoftThreshold(omics_data_host,
powerVector = powers,
verbose = set_verbose,
networkType = "signed",
corFn= "bicor")
idx <- min(which((-sign(sft$fitIndices[,3])*sft$fitIndices[,2]) > 0.90))
if(is.infinite(idx)){
idx <- min(which((-sign(sft$fitIndices[,3])*sft$fitIndices[,2]) > 0.80))
if(!is.infinite(idx)){
st <- sft$fitIndices[idx,1]
} else{
idx <- which.max(-sign(sft$fitIndices[,3])*sft$fitIndices[,2])
st <- sft$fitIndices[idx,1]
}
} else{
st <- sft$fitIndices[idx,1]
}
data.frame(Indices = sft$fitIndices[,1],
sfApprox = -sign(sft$fitIndices[,3])*sft$fitIndices[,2]) %>%
ggplot() +
geom_hline(yintercept = 0.9, color = "red", alpha = 0.6) + # corresponds to R^2 cut-off of 0.9
geom_hline(yintercept = 0.8, color = "red", alpha = 0.2) + # corresponds to R^2 cut-off of 0.8
geom_line(aes(x = Indices, y = sfApprox), color = "red", alpha = 0.1, size = 2.5) +
geom_text(mapping = aes(x = Indices, y = sfApprox, label = Indices), color = "red", size = 4) +
ggtitle("Scale independence") +
xlab("Soft Threshold (power)") +
ylab("SF Model Fit,signed R^2") +
xlim(1,20) +
ylim(-1,1) + theme_bw() +
geom_segment(aes(x = st, y = 0.25, xend = st, yend = sfApprox[idx]-0.05),
arrow = arrow(length = unit(0.2,"cm")),
size = 0.5)-> scale_independence_plot
data.frame(Indices = sft$fitIndices[,1],
meanApprox = sft$fitIndices[,5]) %>%
ggplot() +
geom_line(aes(x = Indices, y = meanApprox), color = "red", alpha = 0.1, size = 2.5) +
geom_text(mapping = aes(x = Indices, y = meanApprox, label = Indices), color = "red", size = 4) +
xlab("Soft Threshold (power)") +
ylab("Mean Connectivity") +theme_bw() +
geom_segment(aes(x = st-0.4,
y = sft$fitIndices$mean.k.[idx],
xend = 0,
yend = sft$fitIndices$mean.k.[idx]),
arrow = arrow(length = unit(0.2,"cm")),
size = 0.4) +
ggtitle(paste0("Mean connectivity: ",
round(sft$fitIndices$mean.k.[idx],2))) -> mean_connectivity_plot
cowplot::plot_grid(scale_independence_plot, mean_connectivity_plot, ncol = 2, align = "h", labels = c("A", "B"), label_size = plot_labeling_size) -> si_mc_plot
si_mc_plot
modules.omics_X <- blockwiseModules(omics_data_host,
power = st,
networkType = "signed",
TOMType = "signed",
corType = assoc_measure,
#maxPOutliers = 0.05,
#deepSplit = 4, # Default 2
minModuleSize = 20, # 30
#minCoreKME = 0.5, # Default 0.5
#minCoreKMESize = 2, # Default minModuleSize/3,
#minKMEtoStay = 0.5, # Default 0.3
#reassignThreshold = 0, # Default 1e-6
#mergeCutHeight = 0.2, # Default 0.15
#pamStage = pam_stage,
#pamRespectsDendro = TRUE,
#replaceMissingAdjacencies = TRUE,
nThreads = 10,
numericLabels = TRUE,
saveTOMs = FALSE,
saveTOMFileBase = "HOST_TOM",
verbose = 3,
maxBlockSize=8000)
hubs <- chooseTopHubInEachModule(omics_data_host, modules.omics_X$colors, power = st, omitColors = "0")
stage2results_X <- list(modules = modules.omics_X,
hubs = hubs)
saveRDS(stage2results_X, "/Users/shashankgupta/Desktop/ImprovAFish/github/ImprovaFish/OmicsIntegration/stage2results_Transcriptomics.rds")
# Convert labels to colors for plotting
merged_colors <- labels2colors(stage2results_X$modules$colors)
n_modules <- unique(merged_colors) %>% length()
samples_good <- sum(stage2results_X$modules$goodSamples) == length(stage2results_X$modules$goodSamples)
genes_good <- sum(stage2results_X$modules$goodGenes) == length(stage2results_X$modules$goodGenes)
ME_good <- sum(stage2results_X$modules$MEsOK) == length(stage2results_X$modules$MEsOK)
table(stage2results_X$modules$colors) %>%
as.data.frame() %>%
dplyr::rename(Module = Var1, Size = Freq) %>%
dplyr::mutate(Module_color = labels2colors(as.numeric(as.character(Module)))) -> module_size
module_size %>%
ggplot(aes(x = Module, y = Size, fill = Module)) +
geom_col(color = "#000000") +
ggtitle("Number of genes in each module") +
theme(legend.position = "none") +
scale_fill_manual(values = setNames(module_size$Module_color,module_size$Module)) +
geom_text(aes(label = Size),vjust = 0.5, hjust = -0.18, size = 3.5) +
ylim(0, max(module_size$Size)*1.1) + theme_bw() +
theme(plot.margin = margin(2, 2, 2, 2, "pt")) +
coord_flip()-> module_size_barplot
module_size_barplot
MEs <- stage2results_X$modules$MEs
# Module correlation to other modules
MEs_R <- bicor(MEs, MEs, maxPOutliers = 0.05)
idx.r <- which(rownames(MEs_R) == "ME0")
idx.c <- which(colnames(MEs_R) == "ME0")
MEs_R_noME0 <- MEs_R[-idx.r, -idx.c]
MEs_R_noME0[upper.tri(MEs_R_noME0)] %>%
as.data.frame() %>%
dplyr::rename("correlation" = ".") %>%
ggplot(aes(correlation)) +
geom_histogram(bins = 20) +
#geom_density() +
xlim(-1, 1) +
ggtitle(paste0(prefix,"ME correlation\n w/o ",prefix ,"ME0")) -> MEs_R_density
pheatmap::pheatmap(MEs_R_noME0, color = colorRampPalette(c("Blue", "White", "Red"))(100),
silent = T,
breaks = seq(-1,1,length.out = 101),
treeheight_row = 10,
treeheight_col = 10,
main = paste0(prefix,"ME correlation heatmap w/o ", prefix ,"ME0"),
labels_row = rep("", length(rownames(MEs_R))), #paste0(prefix, rownames(MEs_R)),
labels_col = rep("", length(colnames(MEs_R))) #paste0(prefix, colnames(MEs_R))
) -> MEs_R_Corr
cowplot::plot_grid(MEs_R_density, MEs_R_Corr$gtable, labels = c("D", "E"), label_size = plot_labeling_size, rel_widths = c(0.6, 1)) -> density_eigen
all(rownames(omics_data_host) == rownames(MEs))
## [1] TRUE
dim(omics_data_host) %>% paste0(c(" samples", " genes"))
## [1] "142 samples" "36878 genes"
kME <- bicor(omics_data_host, MEs, maxPOutliers = 0.05)
dim(kME) %>% paste0(c(" genes", " modules"))
## [1] "36878 genes" "25 modules"
# Corr within modules
corr_within_module <- function(omics_data_host, modules, module_x = 1){
idx.omics_data_host <- which(modules$colors == module_x)
idx.me <- which(colnames(modules$MEs) == paste0("ME",module_x))
kME_x <- bicor(omics_data_host[,idx.omics_data_host], modules$MEs[,idx.me], maxPOutliers = 0.05)
kME_x
}
ggplot.list <- list()
modules <-
colnames(stage2results_X$modules$MEs)[colnames(stage2results_X$modules$MEs) %>% sub("ME", "", .) %>% as.numeric() %>% order()]
for(m in colnames(stage2results_X$modules$MEs)){
h <- as.numeric(sub("ME","", m))
data.frame(x = suppressWarnings(corr_within_module(omics_data_host = omics_data_host, modules = stage2results_X$modules, module_x = h))) %>%
ggplot() +
#geom_density(aes(x = x), fill = labels2colors(h), color = "black", alpha = 0.5) +
geom_histogram(aes(x), fill = labels2colors(h), color = "black", alpha = 0.5, bins = 20) +
xlim(-1, 1) +
xlab("Gene correlation") +
ggtitle(paste0(prefix,m)) -> da_plot
ggplot.list[[m]] <- da_plot
}
ggplot.list <- ggplot.list[ggplot.list %>% names() %>% sub("ME", "", .) %>% as.numeric() %>% order()]
cowplot::plot_grid(plotlist = ggplot.list) -> density_all_plot
# comine to one plot
cowplot::plot_grid(si_mc_plot , density_eigen, ncol = 1, rel_heights = c(0.8,1)) -> part_1
cowplot::plot_grid(part_1, module_size_barplot, labels = c("", "C"), label_size = plot_labeling_size, rel_widths = c(1,0.5)) -> part_2
cowplot::plot_grid(part_2, density_all_plot, ncol = 1, rel_heights = c(0.8,1), labels = c("", "F"), label_size = plot_labeling_size) -> part_3
part_3
This code takes the results of a omics data and extracts the “hubs”, or most highly connected genes in the network, for each module.
stage2results_X$hubs %>%
as.data.frame() %>%
dplyr::rename("gene_id" = ".") %>%
tibble::rownames_to_column(var = "Module") -> top_genes
DT::datatable(top_genes)
Linear regression analysis on the module eigengenes (principal components) from omics data, using the limma package. The sample information is loaded and the factors ‘Time’, ‘New_Diet’, and ‘Day’ are created based on the sample IDs. The design matrix for the regression is created using the ‘Day’ factor, and a linear model fit is performed on the transposed module eigengenes. The regression results are then run through an empirical Bayesian analysis. The topTable function is used to extract the regression statistics and create a data frame with the module names as the row names.
#modules.omics_X.backup <- modules.omics_X
module_eigengenes <- modules.omics_X$MEs
all.equal(sample_info$ID_New, rownames(module_eigengenes))
## [1] TRUE
sample_info %>%
mutate(Time = factor(Time)) %>%
mutate(New_Diet = factor(New_Diet)) %>%
mutate(Day = ifelse(Time == "T1", paste0("T1_", New_Diet),
ifelse(Time == "T2", paste0("T2_", New_Diet), paste0("T3_", New_Diet)))) -> meta
DT::datatable(meta)
des_mat <- model.matrix(~ meta$Day)
fit <- limma::lmFit(t(module_eigengenes), design = des_mat)
fit <- limma::eBayes(fit)
stats_df <- limma::topTable(fit, number = ncol(module_eigengenes)) %>%
tibble::rownames_to_column("module")
DT::datatable(stats_df)
Let’s make plot of module 21
module_df <- module_eigengenes %>%
tibble::rownames_to_column("ID_New") %>%
dplyr::inner_join(meta %>%
dplyr::select(ID_New, Day, Time, New_Diet),
by = c("ID_New" = "ID_New"))
ggplot(module_df, aes(x = Time, y = ME21, color = Day)) +
geom_boxplot(width = 0.2, outlier.shape = NA) +
ggforce::geom_sina(maxwidth = 0.3) +
theme_classic() + facet_wrap("New_Diet")
ggplot(module_df, aes(x = Day, y = ME21, color = Time)) +
geom_boxplot(width = 0.2, outlier.shape = NA) +
ggforce::geom_sina(maxwidth = 0.3) +
theme_classic()
What genes are a part of module 21?
gene_module_key <- tibble::enframe(modules.omics_X$colors, name = "gene", value = "module") %>%
# Let's add the `ME` part so its more clear what these numbers are and it matches elsewhere
dplyr::mutate(module = paste0("ME", module))
# gene_module_key %>%
# dplyr::filter(module == "ME21")
DT::datatable(gene_module_key)
The code in R connects to the AnnotationHub package and selects OrgDb annotations for the species “Salmo salar”. It creates two objects, SsalOrg, by subsetting and extracting the desired annotations from the ah object.
# get OrgDb from AnnotationHub
ah <- AnnotationHub()
SsalOrg <- subset(ah, ah$species == "Salmo salar" & ah$rdataclass=="OrgDb")
SsalOrg <- ah[["AH107424"]]
This script performs gene enrichment analysis on multiple modules of genes. The modules of genes are generated from omics data, and the gene annotations are read from a csv file. The probes from the omics data are matched with the gene IDs from the annotations, and the LocusLink IDs are obtained. The unique modules are then selected, and their LocusLink IDs are written into separate text files. Finally, gene ontology (GO) and KEGG pathway analysis are performed for each module using the enrichGO and enrichKEGG functions, respectively. The results are then visualized using the dotplot function.
moduleLabels <- modules.omics_X$colors
moduleColors <- labels2colors(modules.omics_X$colors)
table(moduleColors)
## moduleColors
## black blue brown cyan darkgreen
## 1311 2480 2192 692 174
## darkgrey darkred darkturquoise green greenyellow
## 118 177 130 1542 864
## grey grey60 lightcyan lightgreen lightyellow
## 13045 460 480 264 248
## magenta midnightblue pink purple red
## 937 676 1029 897 1359
## royalblue salmon tan turquoise yellow
## 225 702 846 4277 1753
# table(moduleLabels)
# table(moduleColors, moduleLabels)
MEs <- modules.omics_X$MEs
geneTree <- modules.omics_X$dendrograms[[1]]
#https://gitlab.com/garethgillard/ssalv3annotation
annot1 <- read.csv("/Users/shashankgupta/Desktop/ImprovAFish/github/Input/Salmo_salar-GCA_905237065.2_gene_annotations.csv", header = TRUE)
probes <- colnames(omics_data_host)
probes2annot <- match(probes, annot1$gene_id)
allLLIDs <- annot1$v2.gene_id.NCBI[probes2annot]
intModules <- unique(moduleColors)
for (module in intModules)
{
# Select module probes
modGenes = (moduleColors==module)
# Get their entrez ID codes
modLLIDs = allLLIDs[modGenes]
# Write them into a file
fileName = paste("LocusLinkIDs-", module, ".txt", sep="");
write.table(as.character(modLLIDs), file = fileName, row.names = FALSE, col.names = FALSE)
}
fileName = paste("LocusLinkIDs-all.txt", sep="");
write.table(as.character(allLLIDs), file = fileName, row.names = FALSE, col.names = FALSE)
setwd("/Users/shashankgupta/Desktop/ImprovAFish/github/ImprovaFish/Transcriptomics/intermediate_files/Gut/")
all.df <- read.table("LocusLinkIDs-all.txt")
all.df$V1 <- as.character(all.df$V1)
# Get unique module colors
list_names <- unique(moduleColors)
for (i in list_names) {
# Read LocusLinkIDs file
assign(paste0(i,"_df"), read.table(paste0("LocusLinkIDs-",i,".txt")))
# Enrich GO terms
assign(paste0("resGO_",i), enrichGO(gene = get(paste0(i,"_df"))$V1, ont = "BP", universe = all.df$V1, OrgDb = SsalOrg, pvalueCutoff = 0.05,qvalueCutoff = 0.05))
# Plot GO terms using dotplot
assign(paste0("resGO_dotplot_",i), dotplot(get(paste0("resGO_",i))))
}
Save all the plots
resGO_dot_plots <- ls(pattern="resGO_dotplot_*")
for (i in 1:length(resGO_dot_plots)) {
tryCatch({
png(paste0(resGO_dot_plots[i], ".png"))
print(get(resGO_dot_plots[i]))
dev.off()
}, error = function(e) {
#print(paste("Error printing", resGO_dot_plots[i], ":", e))
file.remove(paste0(resGO_dot_plots[i], ".png"))
})
}
png_files <- list.files(pattern = "^resGO_dotplot_.*\\.png$")
raster_list <- lapply(png_files, readPNG)
plot <- grid.arrange(grobs = lapply(raster_list, rasterGrob), ncol = 3)
dev.off()
## quartz_off_screen
## 3
moduleColors <- modules.omics_X$colors
mergedMEs <- modules.omics_X$MEs
colorOrder = c("grey", standardColors(50));
moduleLabels = match(moduleColors, colorOrder)-1;
MEs <- mergedMEs
MEs <- MEs[ , -which(names(MEs) %in% c("ME0"))]
modNames <- substring(names(MEs), 3)
# Correlating each genes expression profile with the module eigengenes in order to create module gene sets
geneModuleMembership <- as.data.frame(cor(omics_data_host, MEs, use = "p"))
# Iteratively creating a list of module genesets to test. These are in ensembl ids
moduleGeneSets<-lapply(modNames,function(module){
column = match(module, modNames)
moduleGenes = moduleColors==module
rownames(geneModuleMembership[moduleGenes,])
})
names(moduleGeneSets)<-modNames
# Trimming the module gene sets so that the final two digits after the "." are removed
moduleGeneSets.trimmed<-lapply(moduleGeneSets,function(x){
str_split_fixed(x,"\\.",2)[,1]
})
moduleGeneSets.Entrez <- lapply(moduleGeneSets.trimmed, function(x) {
x <- match(x, annot1$gene_id)
x <- annot1$v2.gene_id.NCBI[x]
return(x)
})
ck<-compareCluster(geneCluster=moduleGeneSets.Entrez,
fun="enrichGO",
OrgDb = SsalOrg,
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
ont = "BP",
universe = all.df$V1)
dotplot(ck)
cols <- c(brewer.pal(8,"Set1"), brewer.pal(7,"Dark2"),brewer.pal(7,"Set2"),brewer.pal(12,"Set3"),brewer.pal(7,"Accent"),brewer.pal(12,"Paired"),"gray")
g<- cnetplot(ck,node_label="category")
g + scale_fill_manual(values =cols)