RNA-seq of skin organoids on GRCh38+chrHsv1


Tags: R, pipeline

  1. import data and draw a PCA plot

    1. # Import the required libraries
    2. library("AnnotationDbi")
    3. library("clusterProfiler")
    4. library("ReactomePA")
    5. library(gplots)
    6. library(tximport)
    7. library(DESeq2)
    8. setwd("~/DATA/Data_Manja_RNAseq_Organoids_Virus/results_chrHsv1_downstream/star_salmon")
    9. # Define paths to your Salmon output quantification files
    10. files <- c("control_r1" = "./control_r1/quant.sf",
    11. "control_r2" = "./control_r2/quant.sf",
    12. "HSV.d2_r1" = "./HSV.d2_r1/quant.sf",
    13. "HSV.d2_r2" = "./HSV.d2_r2/quant.sf",
    14. "HSV.d4_r1" = "./HSV.d4_r1/quant.sf",
    15. "HSV.d4_r2" = "./HSV.d4_r2/quant.sf",
    16. "HSV.d6_r1" = "./HSV.d6_r1/quant.sf",
    17. "HSV.d6_r2" = "./HSV.d6_r2/quant.sf",
    18. "HSV.d8_r1" = "./HSV.d8_r1/quant.sf",
    19. "HSV.d8_r2" = "./HSV.d8_r2/quant.sf")
    20. # Import the transcript abundance data with tximport
    21. txi <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE)
    22. # Define the replicates and condition of the samples
    23. replicate <- factor(c("r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2"))
    24. condition <- factor(c("control", "control", "HSV.d2", "HSV.d2", "HSV.d4", "HSV.d4", "HSV.d6", "HSV.d6", "HSV.d8", "HSV.d8"))
    25. # Define the colData for DESeq2
    26. colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
    27. # Create DESeqDataSet object
    28. dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
    29. # This pre-filtering step is optional when using tximport and DESeq2: DESeq2 applies its own independent filtering of low-count genes during the differential expression analysis.
    30. # Optional filter: retain only genes with counts > 3 in more than 2 samples
    31. # dds <- dds[rowSums(counts(dds) > 3) > 2, ]
    32. # Output raw count data to a CSV file
    33. write.csv(counts(dds), file="transcript_counts.csv")
    34. # -- gene-level count data --
    35. # Read in the tx2gene map from salmon_tx2gene.tsv
    36. #tx2gene <- read.csv("salmon_tx2gene.tsv", sep="\t", header=FALSE)
    37. tx2gene <- read.table("salmon_tx2gene.tsv", sep="\t", header=FALSE, stringsAsFactors=FALSE)
    38. # Set the column names
    39. colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name")
    40. # Remove the gene_name column if not needed
    41. tx2gene <- tx2gene[,1:2]
    42. # Import and summarize the Salmon data with tximport
    43. txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE)
    44. # Continue with the DESeq2 workflow as before...
    45. colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
    46. dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
    47. #dds <- dds[rowSums(counts(dds) > 3) > 2, ] #60605-->26543
    48. write.csv(counts(dds, normalized=FALSE), file="gene_counts.csv")
    49. #~/Tools/csv2xls-0.4/csv_to_xls.py gene_counts.csv -d',' -o gene_counts.xls
    50. #TODO: why were so many reads removed as too_short?
    51. #STAR --runThreadN 4 --genomeDir /path/to/GenomeDir --readFilesIn /path/to/read1.fastq /path/to/read2.fastq --outFilterMatchNmin 50 --outSAMtype BAM SortedByCoordinate --outFileNamePrefix /path/to/output
    52. dim(counts(dds))
    53. head(counts(dds), 10)
    54. #DEBUG: DESeq() should not be used here!?
    55. #TODO_NEXT_WEEK: rerun without first calling DESeq(dds) to check whether the results match process_1
    56. #dds <- DESeq(dds)
    57. rld <- rlogTransformation(dds)  # (a faster vst() alternative is sketched after this step)
    58. # draw simple pca and heatmap
    59. library(gplots)
    60. library("RColorBrewer")
    61. #mat <- assay(rld)
    62. #mm <- model.matrix(~condition, colData(rld))
    63. #mat <- limma::removeBatchEffect(mat, batch=rld$batch, design=mm)
    64. #assay(rld) <- mat
    65. # -- pca --
    66. png("pca.png", 1200, 800)
    67. plotPCA(rld, intgroup=c("condition"))
    68. dev.off()
    69. # -- heatmap --
    70. png("heatmap.png", 1200, 800)
    71. distsRL <- dist(t(assay(rld)))
    72. mat <- as.matrix(distsRL)
    73. hc <- hclust(distsRL)
    74. hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
    75. heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
    76. dev.off()
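    For large gene-level matrices, DESeq2's variance-stabilizing transformation (vst) is a faster alternative to rlogTransformation() and feeds into the same plotPCA()/heatmap code. A minimal sketch, assuming the `dds` object built above (the output file name is arbitrary):

    library(DESeq2)
    vsd <- vst(dds, blind = TRUE)        # blind to the design, comparable to rlog above
    png("pca_vst.png", 1200, 800)        # hypothetical alternative output file
    plotPCA(vsd, intgroup = c("condition"))
    dev.off()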
  2. draw 3D PCA plots.

    1. library(gplots)
    2. library("RColorBrewer")
    3. library(ggplot2)
    4. data <- plotPCA(rld, intgroup=c("condition", "replicate"), returnData=TRUE)
    5. write.csv(data, file="plotPCA_data.csv")
    6. #calculate all PCs including PC3 with the following codes
    7. library(genefilter)
    8. ntop <- 500
    9. rv <- rowVars(assay(rld))
    10. select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
    11. mat <- t( assay(rld)[select, ] )
    12. pc <- prcomp(mat)
    13. pc$x[,1:3]
    14. #df_pc <- data.frame(pc$x[,1:3])
    15. df_pc <- data.frame(pc$x)
    16. identical(rownames(data), rownames(df_pc)) #-->TRUE
    17. data$PC1 <- NULL
    18. data$PC2 <- NULL
    19. merged_df <- merge(data, df_pc, by = "row.names")
    20. #merged_df <- merged_df[, -1]
    21. row.names(merged_df) <- merged_df$Row.names
    22. merged_df$Row.names <- NULL # remove the "name" column
    23. merged_df$name <- NULL
    24. merged_df <- merged_df[, c("PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","PC9","PC10","group","condition","replicate")]
    25. write.csv(merged_df, file="merged_df_10PCs.csv")
    26. summary(pc)
    27. #0.5333 0.2125 0.06852
    28. #0.8026 0.09042 0.06578
    29. # ---- draw_3D.py (Python) ----
    30. # adjust the axis proportions to the real explained-variance values in the following plot
    31. import plotly.graph_objects as go
    32. import pandas as pd
    33. from sklearn.decomposition import PCA
    34. import numpy as np
    35. # Read in data as a pandas dataframe
    36. #df = pd.DataFrame({
    37. # 'PC1': [-13.999925, -12.504291, -12.443057, -13.065235, -17.316215],
    38. # 'PC2': [-1.498823, -3.342411, -6.067055, -8.205809, 3.293993],
    39. # 'PC3': [-3.335085, 15.207755, -14.725450, 15.078469, -6.917358],
    40. # 'condition': ['GFP d3', 'GFP d3', 'GFP d8', 'GFP d8', 'GFP+mCh d9/12'],
    41. # 'replicate': ['DI', 'DII', 'DI', 'DII', 'DI']
    42. #})
    43. df = pd.read_csv('merged_df_10PCs.csv', index_col=0, header=0)
    44. # 'control' keeps its label; relabel the HSV time points for plotting
    45. df['condition'] = df['condition'].replace("HSV.d2", "day 2")
    46. df['condition'] = df['condition'].replace("HSV.d4", "day 4")
    47. df['condition'] = df['condition'].replace("HSV.d6", "day 6")
    48. df['condition'] = df['condition'].replace("HSV.d8", "day 8")
    49. # Fit PCA model to reduce data dimensions to 3
    50. pca = PCA(n_components=3)
    51. pca.fit(df.iloc[:, :-3])
    52. X_reduced = pca.transform(df.iloc[:, :-3])
    53. # Get variance ratios
    54. explained_variance_ratio = pca.explained_variance_ratio_
    55. # Add reduced data back to dataframe
    56. df['PC1'] = X_reduced[:, 0]
    57. df['PC2'] = X_reduced[:, 1]
    58. df['PC3'] = X_reduced[:, 2]
    59. # Create PCA plot with 3D scatter
    60. fig = go.Figure()
    61. ##ff7f00
    62. condition_color_map = {
    63. 'control': 'rgb(100, 100, 100)',
    64. 'day 2': '#33a02c',
    65. 'day 4': '#1f78b4',
    66. 'day 6': '#e31a1c',
    67. 'day 8': 'magenta'
    68. }
    69. replicate_symbol_map = {'r1': 'circle', 'r2': 'diamond'}
    70. for replicate, replicate_symbol in replicate_symbol_map.items():
    71.     for condition, condition_color in condition_color_map.items():
    72.         mask = (df['condition'] == condition) & (df['replicate'] == replicate)
    73.         fig.add_trace(go.Scatter3d(x=df.loc[mask, 'PC1'], y=df.loc[mask, 'PC2'], z=df.loc[mask, 'PC3'],
    74.                                    mode='markers',
    75.                                    name=f'{condition}' if replicate == 'r1' else None,
    76.                                    legendgroup=f'{condition}',
    77.                                    showlegend=True if replicate == 'r1' else False,
    78.                                    marker=dict(size=6 if replicate_symbol in ['diamond'] else 10, opacity=0.8, color=condition_color, symbol=replicate_symbol)))
    79. for replicate, replicate_symbol in replicate_symbol_map.items():
    80.     fig.add_trace(go.Scatter3d(x=[None], y=[None], z=[None],
    81.                                mode='markers',
    82.                                name=replicate,
    83.                                legendgroup=f'{replicate}',
    84.                                showlegend=True,
    85.                                marker=dict(size=10, opacity=1, color='black', symbol=replicate_symbol),
    86.                                hoverinfo='none'))
    87. # Annotations for the legend blocks
    88. #TODO: use the computed explained-variance percentages for the axis titles (see the R sketch after this code block).
    89. #TODO: adjust the axis lengths according to the actual ranges of the axes!
    90. fig.update_layout(
    91. annotations=[
    92. dict(x=1.1, y=1.0, xref='paper', yref='paper', showarrow=False,
    93. text='Condition', font=dict(size=15)),
    94. dict(x=1.1, y=0.6, xref='paper', yref='paper', showarrow=False,
    95. text='Replicate', font=dict(size=15))
    96. ],
    97. scene=dict(
    98. #aspectmode='cube',
    99. #xaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC1: 53% v.', scaleratio=0.53),
    100. #yaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC2: 21% v.', scaleratio=0.21),
    101. #zaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC3: 7% variance', scaleratio=0.07),
    102. aspectmode='manual',
    103. aspectratio=dict(x=explained_variance_ratio[0], y=explained_variance_ratio[1], z=explained_variance_ratio[2]),
    104. xaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC1: 53% v.', range=[min(df['PC1']), max(df['PC1'])]),
    105. yaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC2: 21% v.', range=[min(df['PC2']), max(df['PC2'])]),
    106. zaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC3: 7% variance', range=[min(df['PC3']), max(df['PC3'])]),
    107. bgcolor='white'
    108. ),
    109. margin=dict(l=5, r=5, b=5, t=0) # Adjust the margins to prevent clipping of axis titles
    110. )
    111. #fig.show()
    112. fig.write_image("fig1.svg")
    113. fig.update_layout(
    114. annotations=[
    115. dict(x=1.1, y=1.0, xref='paper', yref='paper', showarrow=False,
    116. text='Condition', font=dict(size=15)),
    117. dict(x=1.1, y=0.6, xref='paper', yref='paper', showarrow=False,
    118. text='Replicate', font=dict(size=15)),
    119. dict(x=1.08, y=0.2, xref='paper', yref='paper', showarrow=False,
    120. text=f'PC3: {explained_variance_ratio[2]*100:.2f}% v.', font=dict(size=15), textangle=-90)
    121. ],
    122. scene=dict(
    123. aspectmode='manual',
    124. aspectratio=dict(x=explained_variance_ratio[0]*2, y=explained_variance_ratio[1]*2, z=explained_variance_ratio[2]*2),
    125. #, range=[min(df['PC1']), max(df['PC1'])]
    126. #, range=[min(df['PC2']), max(df['PC2'])]
    127. #, range=[min(df['PC3']), max(df['PC3'])]
    128. xaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title=f'PC1: {explained_variance_ratio[0]*100:.2f}% variance'),
    129. yaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title=f'PC2: {explained_variance_ratio[1]*100:.2f}% v.'),
    130. zaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title=''),
    131. bgcolor='white'
    132. ),
    133. margin=dict(l=0, r=0, b=0, t=0) # Adjust the margins to prevent clipping of axis titles
    134. )
    135. fig.write_image("PCA_3D.svg")
    136. #/usr/bin/convert g235.png -crop 3250x1680+1+750 PCA_3D_.png
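    The axis titles in draw_3D.py are partly hard-coded ("PC1: 53% v.", etc.). A hedged R sketch, assuming the `pc` object from the prcomp() call above, that computes the explained-variance percentages and exports them so the Python script can use real values (the file name explained_variance.csv is an assumption):

    # proportion of variance explained by each PC, in percent
    percentVar <- round(100 * pc$sdev^2 / sum(pc$sdev^2), 2)
    names(percentVar) <- colnames(pc$x)
    percentVar[1:3]                                  # PC1/PC2/PC3 percentages
    write.csv(data.frame(PC = names(percentVar), percent = percentVar),
              file = "explained_variance.csv", row.names = FALSE)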
  3. (optional) estimate size factors

    1. > head(dds)
    2. class: DESeqDataSet
    3. dim: 6 10
    4. metadata(1): version
    5. assays(6): counts avgTxLength ... H cooks
    6. rownames(6): ENSG00000000003 ENSG00000000005 ... ENSG00000000460
    7. ENSG00000000938
    8. rowData names(34): baseMean baseVar ... deviance maxCooks
    9. colnames(10): control_r1 control_r2 ... HSV.d8_r1 HSV.d8_r2
    10. colData names(2): condition replicate
    11. # Convert BAM to bigWig with deepTools bamCoverage, using the inverse of DESeq2's size factors as --scaleFactor (a sketch that generates these commands follows after this step)
    12. sizeFactors(dds)
    13. #NULL
    14. dds <- estimateSizeFactors(dds)
    15. > sizeFactors(dds)
    16. raw_counts <- counts(dds)
    17. normalized_counts <- counts(dds, normalized=TRUE)
    18. #write.table(raw_counts, file="raw_counts.txt", sep="\t", quote=F, col.names=NA)
    19. #write.table(normalized_counts, file="normalized_counts.txt", sep="\t", quote=F, col.names=NA)
    20. # ---- DEBUG sizeFactors(dds) always NULL, see https://support.bioconductor.org/p/97676/ ----
    21. nm <- assays(dds)[["avgTxLength"]]
    22. sf <- estimateSizeFactorsForMatrix(counts(dds), normMatrix=nm)
    23. assays(dds)$counts # for count data
    24. assays(dds)$avgTxLength # for average transcript length, etc.
    25. assays(dds)$normalizationFactors
    26. # When tximport supplies avgTxLength, DESeq2 stores per-gene normalizationFactors instead of size factors, so sizeFactors(dds) stays NULL and the factors are not retrievable via assays(). They can, however, be computed manually and assigned back to the DESeqDataSet.
    27. # To calculate size factors manually, DESeq2 uses the median-of-ratios method. A very simplified version:
    28. > assays(dds)
    29. List of length 6
    30. names(6): counts avgTxLength normalizationFactors mu H cooks
    31. # simplified median-of-ratios computation:
    32. geoMeans <- apply(assays(dds)$counts, 1, function(row) if (all(row == 0)) 0 else exp(mean(log(row[row != 0]))))
    33. sizeFactors(dds) <- apply(assays(dds)$counts / geoMeans, 2, function(x) median(x[is.finite(x) & x > 0], na.rm = TRUE))
    34. # ---- DEBUG END ----
    35. # at the console:
    36. # control_r1 ...
    37. # 1/0.9978755 ...
    38. > sizeFactors(dds)  # example output from another dataset (HeLa)
    39. HeLa_TO_r1 HeLa_TO_r2
    40.  0.9978755  1.1092227
    41. 1/0.9978755 = 1.002129023
    42. 1/1.1092227 = 0.901532217
    43. #bamCoverage --bam ../markDuplicates/${sample}Aligned.sortedByCoord.out.bam -o ${sample}_norm.bw --binSize 10 --scaleFactor <1/sizeFactor> --effectiveGenomeSize 2864785220
    44. bamCoverage --bam ../markDuplicates/HeLa_TO_r1Aligned.sortedByCoord.out.markDups.bam -o HeLa_TO_r1.bw --binSize 10 --scaleFactor 1.002129023 --effectiveGenomeSize 2864785220
    45. bamCoverage --bam ../markDuplicates/HeLa_TO_r2Aligned.sortedByCoord.out.markDups.bam -o HeLa_TO_r2.bw --binSize 10 --scaleFactor 0.901532217 --effectiveGenomeSize 2864785220
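    Rather than typing one bamCoverage call per sample, the commands can be generated from the size factors in R. A minimal sketch, assuming `dds` has size factors set as above and that the BAM files follow the ../markDuplicates/<sample>Aligned.sortedByCoord.out.markDups.bam naming used in the commands above (both are assumptions):

    sf <- sizeFactors(dds)                                   # named per-sample size factors
    cmds <- sprintf(
      "bamCoverage --bam ../markDuplicates/%sAligned.sortedByCoord.out.markDups.bam -o %s_norm.bw --binSize 10 --scaleFactor %.6f --effectiveGenomeSize 2864785220",
      names(sf), names(sf), 1 / sf)                          # scale factor = inverse size factor
    writeLines(cmds, "run_bamCoverage.sh")                   # hypothetical script name; run it in the shell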
  4. compare the normalization methods

    1. # ---- draw normalization before and after ----
    2. ### Let's implement such a function
    3. ### cds is a countDataset
    4. estimSf <- function (cds){
    5. # Get the count matrix
    6. cts <- counts(cds)
    7. # Compute the geometric mean
    8. geomMean <- function(x) prod(x)^(1/length(x))
    9. # Compute the geometric mean over the line
    10. gm.mean <- apply(cts, 1, geomMean)
    11. # Zero values are set to NA (avoids subsequent division by 0)
    12. gm.mean[gm.mean == 0] <- NA
    13. # Divide each line by its corresponding geometric mean
    14. # sweep(x, MARGIN, STATS, FUN = "-", check.margin = TRUE, ...)
    15. # MARGIN: 1 or 2 (line or columns)
    16. # STATS: a vector of length nrow(x) or ncol(x), depending on MARGIN
    17. # FUN: the function to be applied
    18. cts <- sweep(cts, 1, gm.mean, FUN="/")
    19. # Compute the median over the columns
    20. med <- apply(cts, 2, median, na.rm=TRUE)
    21. # Return the scaling factor
    22. return(med)
    23. }
    24. #https://dputhier.github.io/ASG/practicals/rnaseq_diff_Snf2/rnaseq_diff_Snf2.html
    25. #http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#data-transformations-and-visualization
    26. #https://hbctraining.github.io/DGE_workshop/lessons/02_DGE_count_normalization.html
    27. #https://hbctraining.github.io/DGE_workshop/lessons/04_DGE_DESeq2_analysis.html
    28. #https://genviz.org/module-04-expression/0004/02/01/DifferentialExpression/
    29. #DESeq2’s median of ratios [1]
    30. #EdgeR’s trimmed mean of M values (TMM) [2]
    31. #http://www.nathalievialaneix.eu/doc/html/TP1_normalization.html #very good website!
    32. test_normcount <- sweep(raw_counts, 2, sizeFactors(dds), "/")
    33. sum(test_normcount != normalized_counts)
    34. head(estimSf(dds))
    35. all(round(estimSf(dds),6) == round(sizeFactors(dds), 6))  # (a toy-data check of estimSf() is sketched after this step)
    36. ## Checking the normalization
    37. png("normalization.png", width=800, height=600)
    38. epsilon <- 1 # pseudo-count to avoid problems with log(0)
    39. par(mfrow=c(1,2),cex.lab=0.7)
    40. boxplot(log2(raw_counts+epsilon), cex.axis=0.7, las=1, xlab="log2(raw counts)", horizontal=TRUE, main="Raw counts")
    41. boxplot(log2(normalized_counts+epsilon), cex.axis=0.7, las=1, xlab="log2(normalized counts)", horizontal=TRUE, main="Normalized counts")
    42. #boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2)
    43. #plotDensity(log2(counts(dds.norm)+epsilon), col=col.pheno.selected,
    44. # xlab="log2(counts)", cex.lab=0.7, panel.first=grid())
    45. #plotDensity(log2(counts(dds.norm, normalized=TRUE)+epsilon), col=col.pheno.selected,
    46. # xlab="log2(normalized counts)", cex.lab=0.7, panel.first=grid())
    47. dev.off()
    48. # Since we perform gene-level differential expression analysis with DESeq2, splicing plays no role in this analysis.
    49. # Nanopore sequencing could be used to compare transcript-length distributions; cell-line transcripts may be long, while extracellular-vesicle (EV) transcripts may be short.
    50. library(ggplot2)
    51. library(gridExtra)
    52. library(reshape2)
    53. library(mixOmics)
    54. library(RColorBrewer)
    55. library(DESeq2)
    56. library(edgeR)
    57. library(VennDiagram)
    58. library(devtools)
    59. raw_counts_wn <- raw_counts[rowSums(raw_counts) > 0, ]
    60. dim(raw_counts_wn)
    61. #--Raw counts--
    62. pseudo_counts <- log2(raw_counts_wn + 1)
    63. head(pseudo_counts)
    64. df_raw <- melt(pseudo_counts, id = rownames(raw_counts_wn))
    65. names(df_raw)[1:2]<- c("id", "sample")
    66. df_raw$method <- rep("Raw counts", nrow(df_raw))
    67. head(df_raw)
    68. #--DESeq--
    69. cData = data.frame(row.names=colnames(raw_counts_wn), condition=condition, replicate=replicate)
    70. dge <- DESeqDataSetFromMatrix(countData=raw_counts_wn, colData=cData, design=~condition)
    71. dge <- estimateSizeFactors(dge)
    72. sizeFactors(dge)
    73. deseq_normcount <- counts(dge, normalized = TRUE)
    74. test_normcount <- sweep(raw_counts_wn, 2, sizeFactors(dge), "/")
    75. sum(test_normcount != deseq_normcount)
    76. pseudo_deseq <- log2(deseq_normcount + 1)
    77. df_deseq <- melt(pseudo_deseq, id = rownames(raw_counts_wn))
    78. names(df_deseq)[1:2]<- c("id", "sample")
    79. df_deseq$method <- rep("DESeq (RLE)", nrow(df_raw))
    80. #--edgeR--
    81. dge2 <- DGEList(raw_counts_wn)
    82. dge2
    83. dge2$samples
    84. #--Total count--
    85. pseudo_TC <- log2(cpm(dge2) + 1)
    86. df_TC <- melt(pseudo_TC, id = rownames(raw_counts_wn))
    87. names(df_TC)[1:2] <- c ("id", "sample")
    88. df_TC$method <- rep("TC", nrow(df_TC))
    89. ##--RPKM--
    90. #gene_lengths_wn <- gene_lengths[rowSums(raw_counts) > 0]
    91. #pseudo_RPKM <- log2(rpkm(dge2, gene.length = gene_lengths_wn) + 1)
    92. #df_RPKM <- melt(pseudo_RPKM, id = rownames(raw_counts_wn))
    93. #names(df_RPKM)[1:2] <- c ("id", "sample")
    94. #df_RPKM$method <- rep("RPKM", nrow(df_RPKM))
    95. #--Upper quartile--
    96. dge2 <- calcNormFactors(dge2, method = "upperquartile")
    97. dge2$samples
    98. test_normcount <- sweep(dge2$counts, 2,
    99. dge2$samples$lib.size*dge2$samples$norm.factors / 10^6,
    100. "/")
    101. range(as.vector(test_normcount - cpm(dge2)))
    102. pseudo_UQ <- log2(cpm(dge2) + 1)
    103. df_UQ <- melt(pseudo_UQ, id = rownames(raw_counts_wn))
    104. names(df_UQ)[1:2] <- c ("id", "sample")
    105. df_UQ$method <- rep("UQ", nrow(df_UQ))
    106. #--TMM--
    107. dge2 <- calcNormFactors(dge2, method = "TMM")
    108. dge2$samples
    109. pseudo_TMM <- log2(cpm(dge2) + 1)
    110. df_TMM <- melt(pseudo_TMM, id = rownames(raw_counts_wn))
    111. names(df_TMM)[1:2] <- c ("id", "sample")
    112. #MODIFIED: relabel the TMM-normalized values as "DESeq (RLE)" so the comparison below shows only two panels (raw vs. normalized)
    113. df_TMM$method <- rep("DESeq (RLE)", nrow(df_TMM)) #TMM
    114. #--Comparison--
    115. png("normalization.png", width=800, height=600)
    116. #df_allnorm <- rbind(df_raw, df_deseq, df_TC, df_UQ, df_TMM)
    117. #df_allnorm$method <- factor(df_allnorm$method, levels = c("Raw counts", "DESeq (RLE)", "TC", "TMM", "UQ"))
    118. df_allnorm <- rbind(df_raw, df_TMM)
    119. df_allnorm$method <- factor(df_allnorm$method, levels = c("Raw counts", "DESeq (RLE)"))
    120. p <- ggplot(data=df_allnorm, aes(x=sample, y=value, fill=method))
    121. p <- p + geom_boxplot()
    122. p <- p + theme_bw()
    123. p <- p + ggtitle("Boxplots of normalized pseudo counts\n
    124. for all samples by normalization methods")
    125. p <- p + facet_grid(. ~ method)
    126. p <- p + ylab(expression(log[2] ~ (normalized ~ count + 1))) + xlab("")
    127. p <- p + theme(title = element_text(size=10), axis.text.x = element_blank(),
    128. axis.ticks.x = element_blank())
    129. print(p)
    130. dev.off()
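    Since estimSf() re-implements DESeq2's median-of-ratios estimator, a quick self-contained check on simulated counts can confirm that it matches estimateSizeFactors(). A sketch on hypothetical toy data (not the organoid counts):

    library(DESeq2)
    set.seed(1)
    # 1000 genes x 6 samples with different library sizes (columns get different Poisson means)
    toy <- matrix(rpois(6 * 1000, lambda = 10 * rep(c(1, 2, 0.5, 1, 1.5, 1), each = 1000)),
                  ncol = 6, dimnames = list(paste0("g", 1:1000), paste0("s", 1:6)))
    toy_dds <- DESeqDataSetFromMatrix(toy, data.frame(cond = factor(rep("A", 6))), design = ~1)
    toy_dds <- estimateSizeFactors(toy_dds)
    round(sizeFactors(toy_dds), 3)
    round(estimSf(toy_dds), 3)           # should agree with the line above (up to rounding)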
  5. select the differentially expressed genes

    1. #https://galaxyproject.eu/posts/2020/08/22/three-steps-to-galaxify-your-tool/
    2. #https://www.biostars.org/p/282295/
    3. #https://www.biostars.org/p/335751/
    4. #> condition
    5. # [1] control control HSV.d2 HSV.d2 HSV.d4 HSV.d4 HSV.d6 HSV.d6 HSV.d8 HSV.d8
    6. #Levels: control HSV.d2 HSV.d4 HSV.d6 HSV.d8
    7. #CONSOLE: mkdir /home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/results_chrHsv1_downstream/star_salmon/degenes
    8. setwd("/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/results_chrHsv1_downstream/star_salmon/degenes")
    9. #---- relevel to control (re-run the export loop below after each relevel + clist block; an alternative using explicit contrasts is sketched after this step) ----
    10. dds$condition <- relevel(dds$condition, "control")
    11. dds = DESeq(dds, betaPrior=FALSE)
    12. resultsNames(dds)
    13. clist <- c("HSV.d2_vs_control","HSV.d4_vs_control", "HSV.d6_vs_control", "HSV.d8_vs_control")
    14. dds$condition <- relevel(dds$condition, "HSV.d2")
    15. dds = DESeq(dds, betaPrior=FALSE)
    16. resultsNames(dds)
    17. clist <- c("HSV.d4_vs_HSV.d2", "HSV.d6_vs_HSV.d2", "HSV.d8_vs_HSV.d2")
    18. dds$condition <- relevel(dds$condition, "HSV.d4")
    19. dds = DESeq(dds, betaPrior=FALSE)
    20. resultsNames(dds)
    21. clist <- c("HSV.d6_vs_HSV.d4", "HSV.d8_vs_HSV.d4")
    22. dds$condition <- relevel(dds$condition, "HSV.d6")
    23. dds = DESeq(dds, betaPrior=FALSE)
    24. resultsNames(dds)
    25. clist <- c("HSV.d8_vs_HSV.d6")
    26. for (i in clist) {
    27. contrast = paste("condition", i, sep="_")
    28. res = results(dds, name=contrast)
    29. res <- res[!is.na(res$log2FoldChange),]
    30. res_df <- as.data.frame(res)
    31. write.csv(as.data.frame(res_df[order(res_df$pvalue),]), file = paste(i, "all.txt", sep="-"))
    32. up <- subset(res_df, padj<=0.05 & log2FoldChange>=1)
    33. down <- subset(res_df, padj<=0.05 & log2FoldChange<=-1)
    34. write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
    35. write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
    36. }
    37. echo "contrast = paste(\"condition\", \"${i}\", sep=\"_\")"  # bash fragment: these echo lines belong to the generator loop below
    38. echo "res = results(dds, name=contrast)"
    39. #echo "res <- res[!is.na(res$log2FoldChange),]"
    40. echo "res <- na.omit(res)"
    41. ##https://github.com/kevinblighe/EnhancedVolcano
    42. #BiocManager::install("EnhancedVolcano")
    43. library("EnhancedVolcano")
    44. for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do
    45. #for i in HSV.d8_vs_control; do
    46. echo "res <- read.csv(file = paste(\"${i}\", \"all.txt\", sep=\"-\"), row.names = 1)"
    47. echo "res_df <- as.data.frame(res)"
    48. echo "png(\"${i}.png\",width=800, height=600)"
    49. #legendPosition = 'right',legendLabSize = 12, arrowheads = FALSE,
    50. #echo "EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.2, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression(~Delta*\"$(echo $i | cut -d'_' -f1) versus \" *~Delta*\"$(echo $i | cut -d'_' -f3)\"))"
    51. echo "EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression(\"$(echo $i | cut -d'_' -f1) versus $(echo $i | cut -d'_' -f3)\"))"
    52. echo "dev.off()"
    53. done
    54. #DEBUG: why do some genes appear strongly up-regulated in HSV.d8 vs. control? --> Error: we should keep the read totals from the raw counts (which would leave all genes down-regulated) rather than relying on the default normalization method!
    55. #res <- read.csv(file = paste("HSV.d8_vs_control", "all.txt", sep="-"), row.names = 1)
    56. #res_df <- as.data.frame(res)
    57. #png("HSV.d8_vs_control.png",width=800, height=600)
    58. #EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression("HSV.d8 versus control"))
    59. #dev.off()
    60. #under DIR degenes under KONSOLE
    61. for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do echo "~/Tools/csv2xls-0.4/csv_to_xls.py ${i}-all.txt ${i}-up.txt ${i}-down.txt -d$',' -o ${i}.xls;"; done
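    As an alternative to re-leveling `condition` and re-running DESeq() for every baseline, a single fit (with betaPrior=FALSE) can serve all pairwise comparisons through explicit contrasts. A hedged sketch of this alternative; the output naming mirrors the "-all.txt" convention above:

    dds <- DESeq(dds, betaPrior = FALSE)
    pairs <- combn(levels(dds$condition), 2, simplify = FALSE)       # all unordered level pairs
    for (p in pairs) {
      res <- results(dds, contrast = c("condition", p[2], p[1]))     # p[2] versus p[1]
      res <- res[!is.na(res$log2FoldChange), ]
      write.csv(as.data.frame(res[order(res$pvalue), ]),
                file = paste0(p[2], "_vs_", p[1], "-all.txt"))
    }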
  6. cluster the genes and draw a heatmap

    1. for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id"; echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id"; done
    2. cat *.id | sort -u > ids
    3. # add a header line "Gene_Id" as the first line of the file "ids" and delete the quotation marks
    4. GOI <- read.csv("ids")$Gene_Id
    5. RNASeq.NoCellLine <- assay(rld)
    6. #install.packages("gplots")
    7. library("gplots")
    8. #clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC). pearson or spearman
    9. #datamat = RNASeq.NoCellLine[GOI, ]
    10. datamat = RNASeq.NoCellLine
    11. write.csv(as.data.frame(datamat), file ="gene_expressions.txt")
    12. constant_rows <- apply(datamat, 1, function(row) var(row) == 0)
    13. if(any(constant_rows)) {
    14. cat("Removing", sum(constant_rows), "constant rows.\n")
    15. datamat <- datamat[!constant_rows, ]
    16. }
    17. hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
    18. hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
    19. mycl = cutree(hr, h=max(hr$height)/1.5)
    20. mycol = c("YELLOW", "BLUE", "ORANGE", "MAGENTA", "CYAN", "RED", "GREEN", "MAROON", "LIGHTBLUE", "PINK", "PURPLE", "LIGHTCYAN", "SALMON", "LIGHTGREEN")  # all valid R colour names
    21. mycol = mycol[as.vector(mycl)]
    22. #png("DEGs_heatmap.png", width=900, height=800)
    23. #cex.lab=10, labRow="",
    24. png("DEGs_heatmap.png", width=900, height=1000)
    25. heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row',
    26. scale='row',trace='none',col=bluered(75),
    27. RowSideColors = mycol, margins=c(10,20), cexRow=1.5, srtCol=45, lhei = c(2, 8)) #rownames(datamat)
    28. #heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none", margin=c(5,5), sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none', labRow="", srtCol=30, lhei=c(0.1,2))
    29. dev.off()
    30. #### cluster members (a loop-based export sketch follows after this step) #####
    31. write.csv(names(subset(mycl, mycl == '1')),file='cluster1_YELLOW.txt')
    32. write.csv(names(subset(mycl, mycl == '2')),file='cluster2_DARKBLUE.txt')
    33. write.csv(names(subset(mycl, mycl == '3')),file='cluster3_DARKORANGE.txt')
    34. #~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.txt -d',' -o DEGs_heatmap_cluster_members.xls
    35. ~/Tools/csv2xls-0.4/csv_to_xls.py \
    36. significant_gene_expressions.txt \
    37. -d',' -o DEGs_heatmap_expression_data.xls;
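    Instead of writing one file per cluster by hand, the cluster memberships can be exported in a loop. A minimal sketch, assuming `mycl` from cutree() above; the colour suffixes are just labels (an assumed palette, matching the file-naming style above):

    cluster_labels <- c("YELLOW", "DARKBLUE", "DARKORANGE", "MAGENTA", "CYAN", "RED")  # assumed labels
    for (k in sort(unique(mycl))) {
      members <- names(mycl)[mycl == k]
      lab <- cluster_labels[min(k, length(cluster_labels))]
      write.csv(members, file = sprintf("cluster%d_%s.txt", k, lab))
    }
    # then convert, e.g.: ~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.txt -d',' -o DEGs_heatmap_cluster_members.xls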
