GSVA calculation for NanoString data

gene_x 0 like s 880 view s

Tags: R, sequencing, pipeline

  1. library("rmarkdown")
  2. library("tidyverse")
  3. library(rmarkdown)
  4. library("GeomxTools")
  5. library("GeoMxWorkflows")
  6. library("NanoStringNCTools")
  7. setwd("/home/jhuang/DATA/Data_Susanne_Carotis_spatialRNA_PUBLISHING/run_2023")
  8. render("run.Rmd", c("html_document"))
  9. # ------------------------ Violin Presentation 1 --------------------
  10. #identical(exprs(target_m666Data), assay(target_m666Data, "exprs")): contains the gene expression values that have been both normalized and log-transformed, which are often used for downstream analysis and visualization in many genomic studies.
  11. #assay(target_m666Data, "log_q"): These are likely log2-transformed values of the quantile-normalized data. Quantile normalization ensures that the distributions of gene expression values across samples are the same, and taking the log2 of these normalized values is a common step in the analysis of high-throughput gene expression data. It's a way to make the data more suitable for downstream statistical analysis and visualization.
  12. #assay(target_m666Data, "q_norm"): This typically represents the quantile-normalized gene expression values. Quantile normalization adjusts the expression values in each sample so that they have the same distribution. This normalization method helps remove systematic biases and makes the data more comparable across samples.
  13. library(readxl)
  14. # Path to the Excel file
  15. file_path <- "Signatures.xls"
  16. #example of a signature:
  17. #geneSymbol geneEntrezID ENSEMBL GeneSet
  18. #CD160 11126 ENSG00000117281 Anergic or act. T cells
  19. #CD244 51744 ENSG00000122223 Anergic or act. T cells
  20. #CTLA4 1493 ENSG00000163599 Anergic or act. T cells
  21. #HAVCR2 84868 ENSG00000135077 Anergic or act. T cells
  22. #ICOS 29851 ENSG00000163600 Anergic or act. T cells
  23. #KLRG1 10219 ENSG00000139187 Anergic or act. T cells
  24. #LAG3 3902 ENSG00000089692 Anergic or act. T cells
  25. #PDCD1 5133 ENSG00000188389 Anergic or act. T cells
  26. #PDCD1 5133 ENSG00000276977 Anergic or act. T cells
  27. # Get the names of the sheets
  28. sheet_names <- excel_sheets(file_path)
  29. # Initialize an empty list to hold gene sets
  30. geneSets <- list()
  31. # Loop over each sheet, extract the ENSEMBL IDs, and add to the list
  32. for (sheet in sheet_names) {
  33. # Read the sheet
  34. data <- read_excel(file_path, sheet = sheet)
  35. # Process the GeneSet names (replacing spaces with underscores, for example)
  36. gene_set_name <- gsub(" ", "_", unique(data$GeneSet)[1])
  37. # Add ENSEMBL IDs to the list
  38. geneSets[[gene_set_name]] <- unique(as.character(data$geneSymbol))
  39. }
  40. # Print the result to check
  41. print(geneSets)
  42. library(gridExtra)
  43. library(ggplot2)
  44. library(GSVA)
  45. # 0. for the following calculation, the GSVA input requires a gene expression matrix 'exprs' where rows are genes and columns are samples.
  46. exprs <- exprs(target_m666Data)
  47. #exprs <- exprs(filtered_or_neg_target_m666Data)
  48. # 1. Compute GSVA scores:
  49. gsva_scores <- gsva(exprs, geneSets, method="gsva")
  50. # 2. Convert to data.frame for ggplot:
  51. gsva_df <- as.data.frame(t(gsva_scores))
  52. # 3. Add conditions to gsva_df:
  53. identical(rownames(pData(target_m666Data)), rownames(gsva_df))
  54. gsva_df$Condition <- pData(target_m666Data)$Grp
  55. #identical(rownames(gsva_df_filtered), rownames(pData(target_m666Data)) )
  56. gsva_df$SampleID <- pData(target_m666Data)$SampleID
  57. # 4. Filter the gsva_df to retain only the desired conditions:
  58. #group 1 vs. group 3 in the nanostring data
  59. gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("1", "3"), ]
  60. # 5. Define a function to plot violin plots:
  61. # Update the condition levels in gsva_df_filtered to ensure the desired order on x-axis:
  62. gsva_df_filtered$Condition <- gsub("1", "Group1", gsva_df_filtered$Condition)
  63. gsva_df_filtered$Condition <- gsub("3", "Group3", gsva_df_filtered$Condition)
  64. gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("Group1", "Group3"))
  65. plot_violin <- function(data, gene_name) {
  66. # Calculate the t-test p-value for the two conditions
  67. condition1_data <- data[data$Condition == "Group1", gene_name]
  68. condition2_data <- data[data$Condition == "Group3", gene_name]
  69. p_value <- t.test(condition1_data, condition2_data)$p.value
  70. # Convert p-value to annotation
  71. p_annotation <- ifelse(p_value < 0.01, "**", ifelse(p_value < 0.05, "*", ""))
  72. rounded_p_value <- paste0("p = ", round(p_value, 2))
  73. plot_title <- gsub("_", " ", gene_name)
  74. p <- ggplot(data, aes(x=Condition, y=!!sym(gene_name))) +
  75. geom_violin(linewidth=1.2) +
  76. labs(title=plot_title, y="GSVA Score") +
  77. ylim(-1, 1) +
  78. theme_light() +
  79. theme(
  80. axis.title.x = element_text(size=12),
  81. axis.title.y = element_text(size=12),
  82. axis.text.x = element_text(size=10),
  83. axis.text.y = element_text(size=10),
  84. plot.title = element_text(size=12, hjust=0.5)
  85. )
  86. # Add p-value annotation to the plot
  87. p <- p + annotate("text", x=1.5, y=0.9, label=paste0(p_annotation, " ", rounded_p_value), size=5, hjust=0.5)
  88. return(p)
  89. }
  90. # 6. Generate the list of plots:
  91. #genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
  92. #plots_list <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene))
  93. # 6. Generate the list of plots in a predefined order:
  94. desired_order <- c("Platelets","Granulocytes","LDG","pDC","Anti-inflammation", "Pro-inflam._IL-1","Dendritic_cells","MHC_II","Alt._complement","TNF", "NLRP3_inflammasome","Unfolded_protein","B_cells","Monocyte_cell_surface","Inflammasome", "Monocyte_secreted","IL-1_cytokines","SNOR_low_UP","CD40_activated","Lectin_complement", "Classical_complement","Cell_cycle","Plasma_cells","IG_chains","Erythrocytes", "IL-6R_complex","IFN","TCRB","TCRA","Cyt._act._T_cells", "TCRG","T_cells","CD8T-NK-NKT","Anergic_or_act._T_cells","T_activated", "NK_cells","TCRD","T_regs","SNOR_low_DOWN","Monocytes", "Myeloid_cells","Neutrophils")
  95. genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
  96. genes <- genes[match(desired_order, genes)]
  97. genes <- genes[!is.na(genes)]
  98. plots_list <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene))
  99. # 7. Pad the list of plots:
  100. remaining_plots <- 6 - (length(plots_list) %% 6)
  101. if (remaining_plots != 6) {
  102. plots_list <- c(plots_list, rep(list(NULL), remaining_plots))
  103. }
  104. # 7. Pad the list of plots:
  105. #plots_needed_for_full_grid <- ceiling(length(plots_list) / 6) * 6
  106. #remaining_plots <- plots_needed_for_full_grid - length(plots_list)
  107. #plots_list <- c(plots_list, rep(list(NULL), remaining_plots))
  108. # 8. Create the plots and arrange them in a grid:
  109. library(gridExtra)
  110. plots_matrix <- matrix(plots_list, ncol=6, byrow=T)
  111. #do.call("grid.arrange", c(plots_matrix, list(ncol=6)))
  112. # 9. Save the plots to a PNG:
  113. png("All_Violin_Plots_NanoString_3_vs_1.png", width=1000, height=1000)
  114. do.call("grid.arrange", c(plots_matrix, list(ncol=6)))
  115. dev.off()
  116. # ------------------------ Violin Presentation 2 --------------------
  117. # 2. Convert to data.frame for ggplot:
  118. gsva_df <- as.data.frame(t(gsva_scores))
  119. # 3. Add conditions to gsva_df:
  120. identical(rownames(pData(target_m666Data)), rownames(gsva_df))
  121. gsva_df$Condition <- pData(target_m666Data)$Group
  122. #identical(rownames(gsva_df_filtered), rownames(pData(target_m666Data)) )
  123. gsva_df$SampleID <- pData(target_m666Data)$SampleID
  124. # 4. Filter the gsva_df to retain only the desired conditions:
  125. #group 1 vs. group 3 in the nanostring data
  126. gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("1a", "3"), ]
  127. # 5. Define a function to plot violin plots:
  128. # Update the condition levels in gsva_df_filtered to ensure the desired order on x-axis:
  129. gsva_df_filtered$Condition <- gsub("1a", "Group1a", gsva_df_filtered$Condition)
  130. gsva_df_filtered$Condition <- gsub("3", "Group3", gsva_df_filtered$Condition)
  131. gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("Group1a", "Group3"))
  132. plot_violin <- function(data, gene_name) {
  133. # Calculate the t-test p-value for the two conditions
  134. condition1_data <- data[data$Condition == "Group1a", gene_name]
  135. condition2_data <- data[data$Condition == "Group3", gene_name]
  136. p_value <- t.test(condition1_data, condition2_data)$p.value
  137. # Convert p-value to annotation
  138. p_annotation <- ifelse(p_value < 0.01, "**", ifelse(p_value < 0.05, "*", ""))
  139. rounded_p_value <- paste0("p = ", round(p_value, 2))
  140. plot_title <- gsub("_", " ", gene_name)
  141. p <- ggplot(data, aes(x=Condition, y=!!sym(gene_name))) +
  142. geom_violin(linewidth=1.2) +
  143. labs(title=plot_title, y="GSVA Score") +
  144. ylim(-1, 1) +
  145. theme_light() +
  146. theme(
  147. axis.title.x = element_text(size=12),
  148. axis.title.y = element_text(size=12),
  149. axis.text.x = element_text(size=10),
  150. axis.text.y = element_text(size=10),
  151. plot.title = element_text(size=12, hjust=0.5)
  152. )
  153. # Add p-value annotation to the plot
  154. p <- p + annotate("text", x=1.5, y=0.9, label=paste0(p_annotation, " ", rounded_p_value), size=5, hjust=0.5)
  155. return(p)
  156. }
  157. # 6. Generate the list of plots:
  158. #genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
  159. #plots_list <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene))
  160. # 6. Generate the list of plots in a predefined order:
  161. desired_order <- c("Platelets","Granulocytes","LDG","pDC","Anti-inflammation", "Pro-inflam._IL-1","Dendritic_cells","MHC_II","Alt._complement","TNF", "NLRP3_inflammasome","Unfolded_protein","B_cells","Monocyte_cell_surface","Inflammasome", "Monocyte_secreted","IL-1_cytokines","SNOR_low_UP","CD40_activated","Lectin_complement", "Classical_complement","Cell_cycle","Plasma_cells","IG_chains","Erythrocytes", "IL-6R_complex","IFN","TCRB","TCRA","Cyt._act._T_cells", "TCRG","T_cells","CD8T-NK-NKT","Anergic_or_act._T_cells","T_activated", "NK_cells","TCRD","T_regs","SNOR_low_DOWN","Monocytes", "Myeloid_cells","Neutrophils")
  162. genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
  163. genes <- genes[match(desired_order, genes)]
  164. genes <- genes[!is.na(genes)]
  165. plots_list <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene))
  166. # 7. Pad the list of plots:
  167. remaining_plots <- 6 - (length(plots_list) %% 6)
  168. if (remaining_plots != 6) {
  169. plots_list <- c(plots_list, rep(list(NULL), remaining_plots))
  170. }
  171. # 7. Pad the list of plots:
  172. #plots_needed_for_full_grid <- ceiling(length(plots_list) / 6) * 6
  173. #remaining_plots <- plots_needed_for_full_grid - length(plots_list)
  174. #plots_list <- c(plots_list, rep(list(NULL), remaining_plots))
  175. # 8. Create the plots and arrange them in a grid:
  176. library(gridExtra)
  177. plots_matrix <- matrix(plots_list, ncol=6, byrow=T)
  178. #do.call("grid.arrange", c(plots_matrix, list(ncol=6)))
  179. # 9. Save the plots to a PNG:
  180. png("All_Violin_Plots_NanoString_3_vs_1a.png", width=1000, height=1000)
  181. do.call("grid.arrange", c(plots_matrix, list(ncol=6)))
  182. dev.off()
  183. # ------------------------ Heatmap Presentation --------------------
  184. # 3. Add conditions to gsva_df:
  185. identical(rownames(pData(target_m666Data)), rownames(gsva_df))
  186. gsva_df$Condition <- pData(target_m666Data)$Grp
  187. gsva_df$SampleID <- pData(target_m666Data)$SampleID
  188. # 4. Filter the gsva_df to retain only the desired conditions:
  189. gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("1", "3"), ]
  190. # 5. Define a function to plot violin plots:
  191. # Update the condition levels in gsva_df_filtered to ensure the desired order on x-axis:
  192. gsva_df_filtered$Condition <- gsub("1", "Group1", gsva_df_filtered$Condition)
  193. gsva_df_filtered$Condition <- gsub("3", "Group3", gsva_df_filtered$Condition)
  194. gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("Group1", "Group3"))
  195. # Filter the data for "Group1" samples
  196. group1_data <- gsva_df_filtered[gsva_df_filtered$Condition == "Group1", genes]
  197. # Set the median values for each column in the data.frame
  198. # Calculate median values for each column
  199. group1_medians <- apply(group1_data, 2, median)
  200. # 3. Add conditions to gsva_df:
  201. identical(rownames(pData(target_m666Data)), rownames(gsva_df))
  202. gsva_df$Condition <- pData(target_m666Data)$Group
  203. #identical(rownames(gsva_df_filtered), rownames(pData(target_m666Data)) )
  204. gsva_df$SampleID <- pData(target_m666Data)$SampleID
  205. # 4. Filter the gsva_df to retain only the desired conditions:
  206. #group 1 vs. group 3 in the nanostring data
  207. gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("1a", "1b", "3"), ]
  208. # 5. Define a function to plot violin plots:
  209. # Update the condition levels in gsva_df_filtered to ensure the desired order on x-axis:
  210. gsva_df_filtered$Condition <- gsub("1a", "Group1a", gsva_df_filtered$Condition)
  211. gsva_df_filtered$Condition <- gsub("1b", "Group1b", gsva_df_filtered$Condition)
  212. gsva_df_filtered$Condition <- gsub("3", "Group3", gsva_df_filtered$Condition)
  213. gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("Group1a", "Group1b", "Group3"))
  214. # Filter the data for "Group1a" samples
  215. group1a_data <- gsva_df_filtered[gsva_df_filtered$Condition == "Group1a", genes]
  216. group1a_medians <- apply(group1a_data, 2, median)
  217. # Filter the data for "Group1b" samples
  218. group1b_data <- gsva_df_filtered[gsva_df_filtered$Condition == "Group1b", genes]
  219. group1b_medians <- apply(group1b_data, 2, median)
  220. # Filter the data for "Group3" samples
  221. group3_data <- gsva_df_filtered[gsva_df_filtered$Condition == "Group3", genes]
  222. group3_medians <- apply(group3_data, 2, median)
  223. # Create a new data frame with the middle values for both groups
  224. heatmap_data_nanostring <- data.frame(Group1 = group1_medians, Group1a = group1a_medians, Group1b = group1b_medians, Group3 = group3_medians)
  225. # Merge two heatmap_data_* to a matrix
  226. heatmap_data <- merge(heatmap_data_nanostring, heatmap_data_rnaseq, by="row.names", all=TRUE)
  227. rownames(heatmap_data) <- heatmap_data$Row.names
  228. heatmap_data$Row.names <- NULL
  229. # Transpose the data matrix to switch rows and columns
  230. #heatmap_data <- t(heatmap_data)
  231. # Convert the data frame to a numeric matrix
  232. heatmap_matrix <- as.matrix(heatmap_data)
  233. # Transpose the data matrix to switch rows and columns
  234. #heatmap_data <- t(heatmap_data)
  235. # Convert the data frame to a numeric matrix
  236. heatmap_matrix <- as.matrix(heatmap_data)
  237. # Specify heatmap colors
  238. color_scheme <- colorRampPalette(c("blue", "white", "red"))(50)
  239. #TODO: adjust the color, so that it is not so dark and light!!!!, add RNAseq data for mock and infection!!
  240. # Create the heatmap without hierarchical clustering
  241. library(gplots)
  242. png("Heatmap.png", width=1000, height=1000)
  243. heatmap.2(
  244. heatmap_matrix,
  245. Colv = FALSE, # Turn off column dendrogram
  246. Rowv = FALSE, # Turn off row dendrogram
  247. trace = "none", # Turn off row and column dendrograms
  248. scale = "row", # Scale the rows
  249. col = color_scheme,
  250. main = "GSVA Heatmap", # Title of the heatmap
  251. key = TRUE, # Show color key
  252. key.title = "GSVA Score", # Color key title
  253. key.xlab = NULL, # Remove x-axis label from color key
  254. density.info = "none", # Remove density plot
  255. cexRow = 1.2, # Increase font size of row names
  256. cexCol = 1.2, # Increase font size of column names
  257. srtCol = 40, keysize = 0.72, margins = c(5, 14)
  258. )
  259. dev.off()
  260. # Write the table to Excel-file
  261. library(writexl)
  262. # Specify the file path where you want to save the Excel file
  263. excel_file <- "heatmap_data.xlsx"
  264. # Save the data frame to the Excel file
  265. write_xlsx(heatmap_data, excel_file)
  266. # Verify that the Excel file has been saved
  267. if (file.exists(excel_file)) {
  268. cat("Data frame has been saved to", excel_file, "\n")
  269. } else {
  270. cat("Error: Failed to save the data frame to", excel_file, "\n")
  271. }

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum