RNA-seq of skin organoids on GRCh38+chrHsv1


Tags: R, pipeline

  1. import data and draw a PCA plot

    1. # Import the required libraries
    2. library("AnnotationDbi")
    3. library("clusterProfiler")
    4. library("ReactomePA")
    5. library(gplots)
    6. library(tximport)
    7. library(DESeq2)
    8. setwd("~/DATA/Data_Manja_RNAseq_Organoids_Virus/results_chrHsv1_downstream/star_salmon")
    9. # Define paths to your Salmon output quantification files
    10. files <- c("control_r1" = "./control_r1/quant.sf",
    11. "control_r2" = "./control_r2/quant.sf",
    12. "HSV.d2_r1" = "./HSV.d2_r1/quant.sf",
    13. "HSV.d2_r2" = "./HSV.d2_r2/quant.sf",
    14. "HSV.d4_r1" = "./HSV.d4_r1/quant.sf",
    15. "HSV.d4_r2" = "./HSV.d4_r2/quant.sf",
    16. "HSV.d6_r1" = "./HSV.d6_r1/quant.sf",
    17. "HSV.d6_r2" = "./HSV.d6_r2/quant.sf",
    18. "HSV.d8_r1" = "./HSV.d8_r1/quant.sf",
    19. "HSV.d8_r2" = "./HSV.d8_r2/quant.sf")
    20. # Import the transcript abundance data with tximport
    21. txi <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE)
    22. # Define the replicates and condition of the samples
    23. replicate <- factor(c("r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2"))
    24. condition <- factor(c("control", "control", "HSV.d2", "HSV.d2", "HSV.d4", "HSV.d4", "HSV.d6", "HSV.d6", "HSV.d8", "HSV.d8"))
    25. # Define the colData for DESeq2
    26. colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
    27. # Create DESeqDataSet object
    28. dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
    29. # This pre-filtering step is optional when using tximport and DESeq2: DESeq2 applies its own independent filtering of low-count genes during the differential expression analysis.
    30. # Optional filter: retain only genes with counts > 3 in more than 2 samples
    31. # dds <- dds[rowSums(counts(dds) > 3) > 2, ]
    32. # Output raw count data to a CSV file
    33. write.csv(counts(dds), file="transcript_counts.csv")
    34. # -- gene-level count data --
    35. # Read in the tx2gene map from salmon_tx2gene.tsv
    36. #tx2gene <- read.csv("salmon_tx2gene.tsv", sep="\t", header=FALSE)
    37. tx2gene <- read.table("salmon_tx2gene.tsv", sep="\t", header=FALSE, stringsAsFactors=FALSE)
    38. # Set the column names
    39. colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name")
    40. # Remove the gene_name column if not needed
    41. tx2gene <- tx2gene[,1:2]
    42. # Import and summarize the Salmon data with tximport
    43. txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE)
    44. # Continue with the DESeq2 workflow as before...
    45. colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
    46. dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
    47. #dds <- dds[rowSums(counts(dds) > 3) > 2, ] #60605-->26543
    48. write.csv(counts(dds, normalized=FALSE), file="gene_counts.csv")
    49. #~/Tools/csv2xls-0.4/csv_to_xls.py gene_counts.csv -d',' -o gene_counts.xls
    50. #TODO: why were so many reads removed as too_short?
    51. #STAR --runThreadN 4 --genomeDir /path/to/GenomeDir --readFilesIn /path/to/read1.fastq /path/to/read2.fastq --outFilterMatchNmin 50 --outSAMtype BAM SortedByCoordinate --outFileNamePrefix /path/to/output
    52. dim(counts(dds))
    53. head(counts(dds), 10)
    54. #DEBUG: DESeq() should not be used here!?
    55. #TODO_NEXT_WEEK: rerun without first calling DESeq(dds) to check whether the results match process_1
    56. #dds <- DESeq(dds)
    57. rld <- rlogTransformation(dds)  # (a faster vst() alternative is sketched after this step)
    58. # draw simple pca and heatmap
    59. library(gplots)
    60. library("RColorBrewer")
    61. #mat <- assay(rld)
    62. #mm <- model.matrix(~condition, colData(rld))
    63. #mat <- limma::removeBatchEffect(mat, batch=rld$batch, design=mm)
    64. #assay(rld) <- mat
    65. # -- pca --
    66. png("pca.png", 1200, 800)
    67. plotPCA(rld, intgroup=c("condition"))
    68. dev.off()
    69. # -- heatmap --
    70. png("heatmap.png", 1200, 800)
    71. distsRL <- dist(t(assay(rld)))
    72. mat <- as.matrix(distsRL)
    73. hc <- hclust(distsRL)
    74. hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
    75. heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
    76. dev.off()
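    For large gene-level matrices, DESeq2's variance-stabilizing transformation (vst) is a faster alternative to rlogTransformation() and feeds into the same plotPCA()/heatmap code. A minimal sketch, assuming the `dds` object built above (the output file name is arbitrary):

    library(DESeq2)
    vsd <- vst(dds, blind = TRUE)        # blind to the design, comparable to rlog above
    png("pca_vst.png", 1200, 800)        # hypothetical alternative output file
    plotPCA(vsd, intgroup = c("condition"))
    dev.off()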
  2. draw 3D PCA plots.

    1. library(gplots)
    2. library("RColorBrewer")
    3. library(ggplot2)
    4. data <- plotPCA(rld, intgroup=c("condition", "replicate"), returnData=TRUE)
    5. write.csv(data, file="plotPCA_data.csv")
    6. #calculate all PCs including PC3 with the following codes
    7. library(genefilter)
    8. ntop <- 500
    9. rv <- rowVars(assay(rld))
    10. select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
    11. mat <- t( assay(rld)[select, ] )
    12. pc <- prcomp(mat)
    13. pc$x[,1:3]
    14. #df_pc <- data.frame(pc$x[,1:3])
    15. df_pc <- data.frame(pc$x)
    16. identical(rownames(data), rownames(df_pc)) #-->TRUE
    17. data$PC1 <- NULL
    18. data$PC2 <- NULL
    19. merged_df <- merge(data, df_pc, by = "row.names")
    20. #merged_df <- merged_df[, -1]
    21. row.names(merged_df) <- merged_df$Row.names
    22. merged_df$Row.names <- NULL # remove the "name" column
    23. merged_df$name <- NULL
    24. merged_df <- merged_df[, c("PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","PC9","PC10","group","condition","replicate")]
    25. write.csv(merged_df, file="merged_df_10PCs.csv")
    26. summary(pc)
    27. #0.5333 0.2125 0.06852
    28. #0.8026 0.09042 0.06578
    29. # ---- draw_3D.py (Python) ----
    30. # adjust the axis proportions to the real explained-variance values in the following plot
    31. import plotly.graph_objects as go
    32. import pandas as pd
    33. from sklearn.decomposition import PCA
    34. import numpy as np
    35. # Read in data as a pandas dataframe
    36. #df = pd.DataFrame({
    37. # 'PC1': [-13.999925, -12.504291, -12.443057, -13.065235, -17.316215],
    38. # 'PC2': [-1.498823, -3.342411, -6.067055, -8.205809, 3.293993],
    39. # 'PC3': [-3.335085, 15.207755, -14.725450, 15.078469, -6.917358],
    40. # 'condition': ['GFP d3', 'GFP d3', 'GFP d8', 'GFP d8', 'GFP+mCh d9/12'],
    41. # 'replicate': ['DI', 'DII', 'DI', 'DII', 'DI']
    42. #})
    43. df = pd.read_csv('merged_df_10PCs.csv', index_col=0, header=0)
    44. # 'control' keeps its label; relabel the HSV time points for plotting
    45. df['condition'] = df['condition'].replace("HSV.d2", "day 2")
    46. df['condition'] = df['condition'].replace("HSV.d4", "day 4")
    47. df['condition'] = df['condition'].replace("HSV.d6", "day 6")
    48. df['condition'] = df['condition'].replace("HSV.d8", "day 8")
    49. # Fit PCA model to reduce data dimensions to 3
    50. pca = PCA(n_components=3)
    51. pca.fit(df.iloc[:, :-3])
    52. X_reduced = pca.transform(df.iloc[:, :-3])
    53. # Get variance ratios
    54. explained_variance_ratio = pca.explained_variance_ratio_
    55. # Add reduced data back to dataframe
    56. df['PC1'] = X_reduced[:, 0]
    57. df['PC2'] = X_reduced[:, 1]
    58. df['PC3'] = X_reduced[:, 2]
    59. # Create PCA plot with 3D scatter
    60. fig = go.Figure()
    61. ##ff7f00
    62. condition_color_map = {
    63. 'control': 'rgb(100, 100, 100)',
    64. 'day 2': '#33a02c',
    65. 'day 4': '#1f78b4',
    66. 'day 6': '#e31a1c',
    67. 'day 8': 'magenta'
    68. }
    69. replicate_symbol_map = {'r1': 'circle', 'r2': 'diamond'}
    70. for replicate, replicate_symbol in replicate_symbol_map.items():
    71.     for condition, condition_color in condition_color_map.items():
    72.         mask = (df['condition'] == condition) & (df['replicate'] == replicate)
    73.         fig.add_trace(go.Scatter3d(x=df.loc[mask, 'PC1'], y=df.loc[mask, 'PC2'], z=df.loc[mask, 'PC3'],
    74.                                    mode='markers',
    75.                                    name=f'{condition}' if replicate == 'r1' else None,
    76.                                    legendgroup=f'{condition}',
    77.                                    showlegend=True if replicate == 'r1' else False,
    78.                                    marker=dict(size=6 if replicate_symbol in ['diamond'] else 10, opacity=0.8, color=condition_color, symbol=replicate_symbol)))
    79. for replicate, replicate_symbol in replicate_symbol_map.items():
    80.     fig.add_trace(go.Scatter3d(x=[None], y=[None], z=[None],
    81.                                mode='markers',
    82.                                name=replicate,
    83.                                legendgroup=f'{replicate}',
    84.                                showlegend=True,
    85.                                marker=dict(size=10, opacity=1, color='black', symbol=replicate_symbol),
    86.                                hoverinfo='none'))
    87. # Annotations for the legend blocks
    88. #TODO: use the computed explained-variance percentages for the axis titles (see the R sketch after this code block).
    89. #TODO: adjust the axis lengths according to the actual ranges of the axes!
    90. fig.update_layout(
    91. annotations=[
    92. dict(x=1.1, y=1.0, xref='paper', yref='paper', showarrow=False,
    93. text='Condition', font=dict(size=15)),
    94. dict(x=1.1, y=0.6, xref='paper', yref='paper', showarrow=False,
    95. text='Replicate', font=dict(size=15))
    96. ],
    97. scene=dict(
    98. #aspectmode='cube',
    99. #xaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC1: 53% v.', scaleratio=0.53),
    100. #yaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC2: 21% v.', scaleratio=0.21),
    101. #zaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC3: 7% variance', scaleratio=0.07),
    102. aspectmode='manual',
    103. aspectratio=dict(x=explained_variance_ratio[0], y=explained_variance_ratio[1], z=explained_variance_ratio[2]),
    104. xaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC1: 53% v.', range=[min(df['PC1']), max(df['PC1'])]),
    105. yaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC2: 21% v.', range=[min(df['PC2']), max(df['PC2'])]),
    106. zaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC3: 7% variance', range=[min(df['PC3']), max(df['PC3'])]),
    107. bgcolor='white'
    108. ),
    109. margin=dict(l=5, r=5, b=5, t=0) # Adjust the margins to prevent clipping of axis titles
    110. )
    111. #fig.show()
    112. fig.write_image("fig1.svg")
    113. fig.update_layout(
    114. annotations=[
    115. dict(x=1.1, y=1.0, xref='paper', yref='paper', showarrow=False,
    116. text='Condition', font=dict(size=15)),
    117. dict(x=1.1, y=0.6, xref='paper', yref='paper', showarrow=False,
    118. text='Replicate', font=dict(size=15)),
    119. dict(x=1.08, y=0.2, xref='paper', yref='paper', showarrow=False,
    120. text=f'PC3: {explained_variance_ratio[2]*100:.2f}% v.', font=dict(size=15), textangle=-90)
    121. ],
    122. scene=dict(
    123. aspectmode='manual',
    124. aspectratio=dict(x=explained_variance_ratio[0]*2, y=explained_variance_ratio[1]*2, z=explained_variance_ratio[2]*2),
    125. #, range=[min(df['PC1']), max(df['PC1'])]
    126. #, range=[min(df['PC2']), max(df['PC2'])]
    127. #, range=[min(df['PC3']), max(df['PC3'])]
    128. xaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title=f'PC1: {explained_variance_ratio[0]*100:.2f}% variance'),
    129. yaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title=f'PC2: {explained_variance_ratio[1]*100:.2f}% v.'),
    130. zaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title=''),
    131. bgcolor='white'
    132. ),
    133. margin=dict(l=0, r=0, b=0, t=0) # Adjust the margins to prevent clipping of axis titles
    134. )
    135. fig.write_image("PCA_3D.svg")
    136. #/usr/bin/convert g235.png -crop 3250x1680+1+750 PCA_3D_.png
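    The axis titles in draw_3D.py are partly hard-coded ("PC1: 53% v.", etc.). A hedged R sketch, assuming the `pc` object from the prcomp() call above, that computes the explained-variance percentages and exports them so the Python script can use real values (the file name explained_variance.csv is an assumption):

    # proportion of variance explained by each PC, in percent
    percentVar <- round(100 * pc$sdev^2 / sum(pc$sdev^2), 2)
    names(percentVar) <- colnames(pc$x)
    percentVar[1:3]                                  # PC1/PC2/PC3 percentages
    write.csv(data.frame(PC = names(percentVar), percent = percentVar),
              file = "explained_variance.csv", row.names = FALSE)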
  3. (optional) estimate size factors

    1. > head(dds)
    2. class: DESeqDataSet
    3. dim: 6 10
    4. metadata(1): version
    5. assays(6): counts avgTxLength ... H cooks
    6. rownames(6): ENSG00000000003 ENSG00000000005 ... ENSG00000000460
    7. ENSG00000000938
    8. rowData names(34): baseMean baseVar ... deviance maxCooks
    9. colnames(10): control_r1 control_r2 ... HSV.d8_r1 HSV.d8_r2
    10. colData names(2): condition replicate
    11. # Convert BAM to bigWig with deepTools bamCoverage, using the inverse of DESeq2's size factors as --scaleFactor (a sketch that generates these commands follows after this step)
    12. sizeFactors(dds)
    13. #NULL
    14. dds <- estimateSizeFactors(dds)
    15. > sizeFactors(dds)
    16. raw_counts <- counts(dds)
    17. normalized_counts <- counts(dds, normalized=TRUE)
    18. #write.table(raw_counts, file="raw_counts.txt", sep="\t", quote=F, col.names=NA)
    19. #write.table(normalized_counts, file="normalized_counts.txt", sep="\t", quote=F, col.names=NA)
    20. # ---- DEBUG sizeFactors(dds) always NULL, see https://support.bioconductor.org/p/97676/ ----
    21. nm <- assays(dds)[["avgTxLength"]]
    22. sf <- estimateSizeFactorsForMatrix(counts(dds), normMatrix=nm)
    23. assays(dds)$counts # for count data
    24. assays(dds)$avgTxLength # for average transcript length, etc.
    25. assays(dds)$normalizationFactors
    26. # When tximport supplies avgTxLength, DESeq2 stores per-gene normalizationFactors instead of size factors, so sizeFactors(dds) stays NULL and the factors are not retrievable via assays(). They can, however, be computed manually and assigned back to the DESeqDataSet.
    27. # To calculate size factors manually, DESeq2 uses the median-of-ratios method. A very simplified version:
    28. > assays(dds)
    29. List of length 6
    30. names(6): counts avgTxLength normalizationFactors mu H cooks
    31. # simplified median-of-ratios computation:
    32. geoMeans <- apply(assays(dds)$counts, 1, function(row) if (all(row == 0)) 0 else exp(mean(log(row[row != 0]))))
    33. sizeFactors(dds) <- apply(assays(dds)$counts / geoMeans, 2, function(x) median(x[is.finite(x) & x > 0], na.rm = TRUE))
    34. # ---- DEBUG END ----
    35. # at the console:
    36. # control_r1 ...
    37. # 1/0.9978755 ...
    38. > sizeFactors(dds)  # example output from another dataset (HeLa)
    39. HeLa_TO_r1 HeLa_TO_r2
    40.  0.9978755  1.1092227
    41. 1/0.9978755 = 1.002129023
    42. 1/1.1092227 = 0.901532217
    43. #bamCoverage --bam ../markDuplicates/${sample}Aligned.sortedByCoord.out.bam -o ${sample}_norm.bw --binSize 10 --scaleFactor <1/sizeFactor> --effectiveGenomeSize 2864785220
    44. bamCoverage --bam ../markDuplicates/HeLa_TO_r1Aligned.sortedByCoord.out.markDups.bam -o HeLa_TO_r1.bw --binSize 10 --scaleFactor 1.002129023 --effectiveGenomeSize 2864785220
    45. bamCoverage --bam ../markDuplicates/HeLa_TO_r2Aligned.sortedByCoord.out.markDups.bam -o HeLa_TO_r2.bw --binSize 10 --scaleFactor 0.901532217 --effectiveGenomeSize 2864785220
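    Rather than typing one bamCoverage call per sample, the commands can be generated from the size factors in R. A minimal sketch, assuming `dds` has size factors set as above and that the BAM files follow the ../markDuplicates/<sample>Aligned.sortedByCoord.out.markDups.bam naming used in the commands above (both are assumptions):

    sf <- sizeFactors(dds)                                   # named per-sample size factors
    cmds <- sprintf(
      "bamCoverage --bam ../markDuplicates/%sAligned.sortedByCoord.out.markDups.bam -o %s_norm.bw --binSize 10 --scaleFactor %.6f --effectiveGenomeSize 2864785220",
      names(sf), names(sf), 1 / sf)                          # scale factor = inverse size factor
    writeLines(cmds, "run_bamCoverage.sh")                   # hypothetical script name; run it in the shell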
  4. compare the normalization methods

    1. # ---- draw normalization before and after ----
    2. ### Let's implement such a function
    3. ### cds is a countDataset
    4. estimSf <- function (cds){
    5. # Get the count matrix
    6. cts <- counts(cds)
    7. # Compute the geometric mean
    8. geomMean <- function(x) prod(x)^(1/length(x))
    9. # Compute the geometric mean over the line
    10. gm.mean <- apply(cts, 1, geomMean)
    11. # Zero values are set to NA (avoids subsequent division by 0)
    12. gm.mean[gm.mean == 0] <- NA
    13. # Divide each line by its corresponding geometric mean
    14. # sweep(x, MARGIN, STATS, FUN = "-", check.margin = TRUE, ...)
    15. # MARGIN: 1 or 2 (line or columns)
    16. # STATS: a vector of length nrow(x) or ncol(x), depending on MARGIN
    17. # FUN: the function to be applied
    18. cts <- sweep(cts, 1, gm.mean, FUN="/")
    19. # Compute the median over the columns
    20. med <- apply(cts, 2, median, na.rm=TRUE)
    21. # Return the scaling factor
    22. return(med)
    23. }
    24. #https://dputhier.github.io/ASG/practicals/rnaseq_diff_Snf2/rnaseq_diff_Snf2.html
    25. #http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#data-transformations-and-visualization
    26. #https://hbctraining.github.io/DGE_workshop/lessons/02_DGE_count_normalization.html
    27. #https://hbctraining.github.io/DGE_workshop/lessons/04_DGE_DESeq2_analysis.html
    28. #https://genviz.org/module-04-expression/0004/02/01/DifferentialExpression/
    29. #DESeq2’s median of ratios [1]
    30. #EdgeR’s trimmed mean of M values (TMM) [2]
    31. #http://www.nathalievialaneix.eu/doc/html/TP1_normalization.html #very good website!
    32. test_normcount <- sweep(raw_counts, 2, sizeFactors(dds), "/")
    33. sum(test_normcount != normalized_counts)
    34. head(estimSf(dds))
    35. all(round(estimSf(dds),6) == round(sizeFactors(dds), 6))  # (a toy-data check of estimSf() is sketched after this step)
    36. ## Checking the normalization
    37. png("normalization.png", width=800, height=600)
    38. epsilon <- 1 # pseudo-count to avoid problems with log(0)
    39. par(mfrow=c(1,2),cex.lab=0.7)
    40. boxplot(log2(raw_counts+epsilon), cex.axis=0.7, las=1, xlab="log2(raw counts)", horizontal=TRUE, main="Raw counts")
    41. boxplot(log2(normalized_counts+epsilon), cex.axis=0.7, las=1, xlab="log2(normalized counts)", horizontal=TRUE, main="Normalized counts")
    42. #boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2)
    43. #plotDensity(log2(counts(dds.norm)+epsilon), col=col.pheno.selected,
    44. # xlab="log2(counts)", cex.lab=0.7, panel.first=grid())
    45. #plotDensity(log2(counts(dds.norm, normalized=TRUE)+epsilon), col=col.pheno.selected,
    46. # xlab="log2(normalized counts)", cex.lab=0.7, panel.first=grid())
    47. dev.off()
    48. # Since we perform gene-level differential expression analysis with DESeq2, splicing plays no role in this analysis.
    49. # Nanopore sequencing could be used to compare transcript-length distributions; cell-line transcripts may be long, while extracellular-vesicle (EV) transcripts may be short.
    50. library(ggplot2)
    51. library(gridExtra)
    52. library(reshape2)
    53. library(mixOmics)
    54. library(RColorBrewer)
    55. library(DESeq2)
    56. library(edgeR)
    57. library(VennDiagram)
    58. library(devtools)
    59. raw_counts_wn <- raw_counts[rowSums(raw_counts) > 0, ]
    60. dim(raw_counts_wn)
    61. #--Raw counts--
    62. pseudo_counts <- log2(raw_counts_wn + 1)
    63. head(pseudo_counts)
    64. df_raw <- melt(pseudo_counts, id = rownames(raw_counts_wn))
    65. names(df_raw)[1:2]<- c("id", "sample")
    66. df_raw$method <- rep("Raw counts", nrow(df_raw))
    67. head(df_raw)
    68. #--DESeq--
    69. cData = data.frame(row.names=colnames(raw_counts_wn), condition=condition, replicate=replicate)
    70. dge <- DESeqDataSetFromMatrix(countData=raw_counts_wn, colData=cData, design=~condition)
    71. dge <- estimateSizeFactors(dge)
    72. sizeFactors(dge)
    73. deseq_normcount <- counts(dge, normalized = TRUE)
    74. test_normcount <- sweep(raw_counts_wn, 2, sizeFactors(dge), "/")
    75. sum(test_normcount != deseq_normcount)
    76. pseudo_deseq <- log2(deseq_normcount + 1)
    77. df_deseq <- melt(pseudo_deseq, id = rownames(raw_counts_wn))
    78. names(df_deseq)[1:2]<- c("id", "sample")
    79. df_deseq$method <- rep("DESeq (RLE)", nrow(df_raw))
    80. #--edgeR--
    81. dge2 <- DGEList(raw_counts_wn)
    82. dge2
    83. dge2$samples
    84. #--Total count--
    85. pseudo_TC <- log2(cpm(dge2) + 1)
    86. df_TC <- melt(pseudo_TC, id = rownames(raw_counts_wn))
    87. names(df_TC)[1:2] <- c ("id", "sample")
    88. df_TC$method <- rep("TC", nrow(df_TC))
    89. ##--RPKM--
    90. #gene_lengths_wn <- gene_lengths[rowSums(raw_counts) > 0]
    91. #pseudo_RPKM <- log2(rpkm(dge2, gene.length = gene_lengths_wn) + 1)
    92. #df_RPKM <- melt(pseudo_RPKM, id = rownames(raw_counts_wn))
    93. #names(df_RPKM)[1:2] <- c ("id", "sample")
    94. #df_RPKM$method <- rep("RPKM", nrow(df_RPKM))
    95. #--Upper quartile--
    96. dge2 <- calcNormFactors(dge2, method = "upperquartile")
    97. dge2$samples
    98. test_normcount <- sweep(dge2$counts, 2,
    99. dge2$samples$lib.size*dge2$samples$norm.factors / 10^6,
    100. "/")
    101. range(as.vector(test_normcount - cpm(dge2)))
    102. pseudo_UQ <- log2(cpm(dge2) + 1)
    103. df_UQ <- melt(pseudo_UQ, id = rownames(raw_counts_wn))
    104. names(df_UQ)[1:2] <- c ("id", "sample")
    105. df_UQ$method <- rep("UQ", nrow(df_UQ))
    106. #--TMM--
    107. dge2 <- calcNormFactors(dge2, method = "TMM")
    108. dge2$samples
    109. pseudo_TMM <- log2(cpm(dge2) + 1)
    110. df_TMM <- melt(pseudo_TMM, id = rownames(raw_counts_wn))
    111. names(df_TMM)[1:2] <- c ("id", "sample")
    112. #MODIFIED: relabel the TMM-normalized values as "DESeq (RLE)" so the comparison below shows only two panels (raw vs. normalized)
    113. df_TMM$method <- rep("DESeq (RLE)", nrow(df_TMM)) #TMM
    114. #--Comparison--
    115. png("normalization.png", width=800, height=600)
    116. #df_allnorm <- rbind(df_raw, df_deseq, df_TC, df_UQ, df_TMM)
    117. #df_allnorm$method <- factor(df_allnorm$method, levels = c("Raw counts", "DESeq (RLE)", "TC", "TMM", "UQ"))
    118. df_allnorm <- rbind(df_raw, df_TMM)
    119. df_allnorm$method <- factor(df_allnorm$method, levels = c("Raw counts", "DESeq (RLE)"))
    120. p <- ggplot(data=df_allnorm, aes(x=sample, y=value, fill=method))
    121. p <- p + geom_boxplot()
    122. p <- p + theme_bw()
    123. p <- p + ggtitle("Boxplots of normalized pseudo counts\n
    124. for all samples by normalization methods")
    125. p <- p + facet_grid(. ~ method)
    126. p <- p + ylab(expression(log[2] ~ (normalized ~ count + 1))) + xlab("")
    127. p <- p + theme(title = element_text(size=10), axis.text.x = element_blank(),
    128. axis.ticks.x = element_blank())
    129. print(p)
    130. dev.off()
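    Since estimSf() re-implements DESeq2's median-of-ratios estimator, a quick self-contained check on simulated counts can confirm that it matches estimateSizeFactors(). A sketch on hypothetical toy data (not the organoid counts):

    library(DESeq2)
    set.seed(1)
    # 1000 genes x 6 samples with different library sizes (columns get different Poisson means)
    toy <- matrix(rpois(6 * 1000, lambda = 10 * rep(c(1, 2, 0.5, 1, 1.5, 1), each = 1000)),
                  ncol = 6, dimnames = list(paste0("g", 1:1000), paste0("s", 1:6)))
    toy_dds <- DESeqDataSetFromMatrix(toy, data.frame(cond = factor(rep("A", 6))), design = ~1)
    toy_dds <- estimateSizeFactors(toy_dds)
    round(sizeFactors(toy_dds), 3)
    round(estimSf(toy_dds), 3)           # should agree with the line above (up to rounding)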
  5. select the differentially expressed genes

    1. #https://galaxyproject.eu/posts/2020/08/22/three-steps-to-galaxify-your-tool/
    2. #https://www.biostars.org/p/282295/
    3. #https://www.biostars.org/p/335751/
    4. #> condition
    5. # [1] control control HSV.d2 HSV.d2 HSV.d4 HSV.d4 HSV.d6 HSV.d6 HSV.d8 HSV.d8
    6. #Levels: control HSV.d2 HSV.d4 HSV.d6 HSV.d8
    7. #CONSOLE: mkdir /home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/results_chrHsv1_downstream/star_salmon/degenes
    8. setwd("/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/results_chrHsv1_downstream/star_salmon/degenes")
    9. #---- relevel to control (re-run the export loop below after each relevel + clist block; an alternative using explicit contrasts is sketched after this step) ----
    10. dds$condition <- relevel(dds$condition, "control")
    11. dds = DESeq(dds, betaPrior=FALSE)
    12. resultsNames(dds)
    13. clist <- c("HSV.d2_vs_control","HSV.d4_vs_control", "HSV.d6_vs_control", "HSV.d8_vs_control")
    14. dds$condition <- relevel(dds$condition, "HSV.d2")
    15. dds = DESeq(dds, betaPrior=FALSE)
    16. resultsNames(dds)
    17. clist <- c("HSV.d4_vs_HSV.d2", "HSV.d6_vs_HSV.d2", "HSV.d8_vs_HSV.d2")
    18. dds$condition <- relevel(dds$condition, "HSV.d4")
    19. dds = DESeq(dds, betaPrior=FALSE)
    20. resultsNames(dds)
    21. clist <- c("HSV.d6_vs_HSV.d4", "HSV.d8_vs_HSV.d4")
    22. dds$condition <- relevel(dds$condition, "HSV.d6")
    23. dds = DESeq(dds, betaPrior=FALSE)
    24. resultsNames(dds)
    25. clist <- c("HSV.d8_vs_HSV.d6")
    26. for (i in clist) {
    27. contrast = paste("condition", i, sep="_")
    28. res = results(dds, name=contrast)
    29. res <- res[!is.na(res$log2FoldChange),]
    30. res_df <- as.data.frame(res)
    31. write.csv(as.data.frame(res_df[order(res_df$pvalue),]), file = paste(i, "all.txt", sep="-"))
    32. up <- subset(res_df, padj<=0.05 & log2FoldChange>=1)
    33. down <- subset(res_df, padj<=0.05 & log2FoldChange<=-1)
    34. write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
    35. write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
    36. }
    37. echo "contrast = paste(\"condition\", \"${i}\", sep=\"_\")"  # bash fragment: these echo lines belong to the generator loop below
    38. echo "res = results(dds, name=contrast)"
    39. #echo "res <- res[!is.na(res$log2FoldChange),]"
    40. echo "res <- na.omit(res)"
    41. ##https://github.com/kevinblighe/EnhancedVolcano
    42. #BiocManager::install("EnhancedVolcano")
    43. library("EnhancedVolcano")
    44. for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do
    45. #for i in HSV.d8_vs_control; do
    46. echo "res <- read.csv(file = paste(\"${i}\", \"all.txt\", sep=\"-\"), row.names = 1)"
    47. echo "res_df <- as.data.frame(res)"
    48. echo "png(\"${i}.png\",width=800, height=600)"
    49. #legendPosition = 'right',legendLabSize = 12, arrowheads = FALSE,
    50. #echo "EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.2, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression(~Delta*\"$(echo $i | cut -d'_' -f1) versus \" *~Delta*\"$(echo $i | cut -d'_' -f3)\"))"
    51. echo "EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression(\"$(echo $i | cut -d'_' -f1) versus $(echo $i | cut -d'_' -f3)\"))"
    52. echo "dev.off()"
    53. done
    54. #DEBUG: why do some genes appear strongly up-regulated in HSV.d8 vs. control? --> Error: we should keep the read totals from the raw counts (which would leave all genes down-regulated) rather than relying on the default normalization method!
    55. #res <- read.csv(file = paste("HSV.d8_vs_control", "all.txt", sep="-"), row.names = 1)
    56. #res_df <- as.data.frame(res)
    57. #png("HSV.d8_vs_control.png",width=800, height=600)
    58. #EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression("HSV.d8 versus control"))
    59. #dev.off()
    60. #under DIR degenes under KONSOLE
    61. for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do echo "~/Tools/csv2xls-0.4/csv_to_xls.py ${i}-all.txt ${i}-up.txt ${i}-down.txt -d$',' -o ${i}.xls;"; done
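    As an alternative to re-leveling `condition` and re-running DESeq() for every baseline, a single fit (with betaPrior=FALSE) can serve all pairwise comparisons through explicit contrasts. A hedged sketch of this alternative; the output naming mirrors the "-all.txt" convention above:

    dds <- DESeq(dds, betaPrior = FALSE)
    pairs <- combn(levels(dds$condition), 2, simplify = FALSE)       # all unordered level pairs
    for (p in pairs) {
      res <- results(dds, contrast = c("condition", p[2], p[1]))     # p[2] versus p[1]
      res <- res[!is.na(res$log2FoldChange), ]
      write.csv(as.data.frame(res[order(res$pvalue), ]),
                file = paste0(p[2], "_vs_", p[1], "-all.txt"))
    }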
  6. cluster the genes and draw a heatmap

    1. for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id"; echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id"; done
    2. cat *.id | sort -u > ids
    3. # add a header line "Gene_Id" as the first line of the file "ids" and delete the quotation marks
    4. GOI <- read.csv("ids")$Gene_Id
    5. RNASeq.NoCellLine <- assay(rld)
    6. #install.packages("gplots")
    7. library("gplots")
    8. #clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC). pearson or spearman
    9. #datamat = RNASeq.NoCellLine[GOI, ]
    10. datamat = RNASeq.NoCellLine
    11. write.csv(as.data.frame(datamat), file ="gene_expressions.txt")
    12. constant_rows <- apply(datamat, 1, function(row) var(row) == 0)
    13. if(any(constant_rows)) {
    14. cat("Removing", sum(constant_rows), "constant rows.\n")
    15. datamat <- datamat[!constant_rows, ]
    16. }
    17. hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
    18. hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
    19. mycl = cutree(hr, h=max(hr$height)/1.5)
    20. mycol = c("YELLOW", "BLUE", "ORANGE", "MAGENTA", "CYAN", "RED", "GREEN", "MAROON", "LIGHTBLUE", "PINK", "PURPLE", "LIGHTCYAN", "SALMON", "LIGHTGREEN")  # all valid R colour names
    21. mycol = mycol[as.vector(mycl)]
    22. #png("DEGs_heatmap.png", width=900, height=800)
    23. #cex.lab=10, labRow="",
    24. png("DEGs_heatmap.png", width=900, height=1000)
    25. heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row',
    26. scale='row',trace='none',col=bluered(75),
    27. RowSideColors = mycol, margins=c(10,20), cexRow=1.5, srtCol=45, lhei = c(2, 8)) #rownames(datamat)
    28. #heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none", margin=c(5,5), sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none', labRow="", srtCol=30, lhei=c(0.1,2))
    29. dev.off()
    30. #### cluster members (a loop-based export sketch follows after this step) #####
    31. write.csv(names(subset(mycl, mycl == '1')),file='cluster1_YELLOW.txt')
    32. write.csv(names(subset(mycl, mycl == '2')),file='cluster2_DARKBLUE.txt')
    33. write.csv(names(subset(mycl, mycl == '3')),file='cluster3_DARKORANGE.txt')
    34. #~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.txt -d',' -o DEGs_heatmap_cluster_members.xls
    35. ~/Tools/csv2xls-0.4/csv_to_xls.py \
    36. significant_gene_expressions.txt \
    37. -d',' -o DEGs_heatmap_expression_data.xls;
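    Instead of writing one file per cluster by hand, the cluster memberships can be exported in a loop. A minimal sketch, assuming `mycl` from cutree() above; the colour suffixes are just labels (an assumed palette, matching the file-naming style above):

    cluster_labels <- c("YELLOW", "DARKBLUE", "DARKORANGE", "MAGENTA", "CYAN", "RED")  # assumed labels
    for (k in sort(unique(mycl))) {
      members <- names(mycl)[mycl == k]
      lab <- cluster_labels[min(k, length(cluster_labels))]
      write.csv(members, file = sprintf("cluster%d_%s.txt", k, lab))
    }
    # then convert, e.g.: ~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.txt -d',' -o DEGs_heatmap_cluster_members.xls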
