Analysis of the RNA binding protein motifs for RNA Seq

gene_x 0 like s 14 view s

Tags: pipeline

Analysis of the RNA binding protein motifs for RNA Seq

TODOs_NEXTWEEK: how to use RBPmap results to enrich the significant RBPs for RNA-seq (3' UTRs?) and miRNAs?

    🧬 1. RBPmap (Web + Command-line)

        Type: Web tool and downloadable Perl scripts

        Purpose: Scans RNA sequences for known RBP motifs

        Database: Uses curated RBP motif sets (e.g., from literature, CLIP data)

        URL: https://rbpmap.technion.ac.il/

    ✅ Good for: Biologists without strong R background
    🚫 Not a package, but scriptable via CLI or HTML results
  1. filtering out RBP from the database

  2. improve the output of the 3utr_down_cleaned.fasta

  3. multiple testing

  4. perform this for miRNA.

http://xgenes.com/article/article-content/146/peak-and-motif-analyses-in-promoters/

http://xgenes.com/article/article-content/154/density-of-motif-plots-and-its-statistical-tests/

There are several alternative R packages and tools to perform motif enrichment analysis for RNA-binding proteins (RBPs), beyond PWMEnrich::motifEnrichment(). Here are the most notable ones:

| Tool / Package | Enrichment | Custom Motifs | CLI or R? | RNA-specific? |

| ------------------------ | ----------------- | --------------- | --------- | -------------- |

| PWMEnrich | ✅ | ✅ | R | ✅ |

| RBPmap | ✅ | ❌ (uses own db) | Web/CLI | ✅ | ----> try RBPmap_results + enrichments!

| Biostrings/TFBSTools | ❌ (only scanning) | ✅ | R | ❌ | #ATtRACT + Biostrings / TFBSTools

| rmap | ✅ (CLIP-based) | ❌ | R | ✅ |

| Homer | ✅ | ✅ | CLI | ⚠ RNA optional |

| MEME (AME, FIMO) | ✅ | ✅ | Web/CLI | ⚠ Generic |

  1. Using R to get 3UTR_sequences.fasta, 5UTR

    setwd("~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/motif_analysis")
    
    # === STEP 0: Load libraries ===
    if (!requireNamespace("biomaRt", quietly = TRUE)) install.packages("biomaRt")
    library(biomaRt)
    
    # === STEP 1: Load DEG files ===
    ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-up.txt    #20086
    ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-down.txt  #634
    ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/WaGa_wt.EV_vs_parental-up.txt     #23832
    ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/WaGa_wt.EV_vs_parental-down.txt   #375
    up_file <- "~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-up.txt"
    down_file <- "~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-down.txt"
    
    up_degs <- read.table(up_file, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
    down_degs <- read.table(down_file, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
    
    # === STEP 2: Clean & extract Ensembl gene IDs ===
    up_ids <- unique(na.omit(up_degs[[1]]))
    down_ids <- unique(na.omit(down_degs[[1]]))
    
    # Optional: protein-coding filter
    # up_ids <- unique(up_degs$external_gene_name[up_degs$gene_biotype == "protein_coding"])
    # down_ids <- unique(down_degs$external_gene_name[down_degs$gene_biotype == "protein_coding"])
    
    # === STEP 3: Connect to Ensembl Asia mirror ===
    ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl", host = "https://asia.ensembl.org")
    
    # === STEP 4: Robust, minimal-loss sequence fetch ===
    get_sequences <- function(ids, seqType, batch_size = 300, retry_depth = 3) {
    all_seqs <- list()
    failed_ids <- c()
    
    safe_batch_query <- function(batch_ids, depth = 1) {
        if (length(batch_ids) == 0) return(NULL)
    
        tryCatch({
        seqs <- getSequence(id = batch_ids,
                            type = "ensembl_gene_id",
                            seqType = seqType,
                            mart = ensembl)
        seqs <- seqs[seqs[[seqType]] != "", ]
        return(seqs)
        }, error = function(e) {
        if (depth < retry_depth && length(batch_ids) > 1) {
            # Split batch and retry each half
            split_batches <- split(batch_ids, ceiling(seq_along(batch_ids)/ceiling(length(batch_ids)/2)))
            result <- lapply(split_batches, safe_batch_query, depth = depth + 1)
            return(do.call(rbind, result))
        } else {
            message("❌ Final failure for ", length(batch_ids), " genes. Logging IDs.")
            failed_ids <<- c(failed_ids, batch_ids)
            return(NULL)
        }
        })
    }
    
    id_batches <- split(ids, ceiling(seq_along(ids) / batch_size))
    for (i in seq_along(id_batches)) {
        batch <- id_batches[[i]]
        cat(sprintf("    🔹 Batch %d/%d (%d genes)\n", i, length(id_batches), length(batch)))
        result <- safe_batch_query(batch)
        if (!is.null(result) && nrow(result) > 0) {
        all_seqs[[length(all_seqs) + 1]] <- result
        }
    }
    
    if (length(failed_ids) > 0) {
        writeLines(failed_ids, paste0("failed_ids_", seqType, ".txt"))
        message("⚠️ Failed IDs saved to failed_ids_", seqType, ".txt")
    }
    
    if (length(all_seqs) == 0) return(data.frame())
    return(do.call(rbind, all_seqs))
    }
    
    # === STEP 5: Write FASTA ===
    write_fasta <- function(sequences, names, file) {
    con <- file(file, "w")
    for (i in seq_along(sequences)) {
        cat(">", names[i], "\n", sequences[i], "\n", file = con, sep = "")
    }
    close(con)
    }
    
    # === STEP 6: Background sampling ===
    cat("📦 Fetching background gene list...\n")
    all_deg_ids <- unique(c(up_ids, down_ids))
    all_genes <- getBM(attributes = c("ensembl_gene_id"), mart = ensembl)
    bg_ids <- setdiff(all_genes$ensembl_gene_id, all_deg_ids)
    bg_sample <- sample(bg_ids, 1000)
    
    # === STEP 7: Fetch sequences by group and type ===
    seq_types <- c("3utr", "5utr", "cds", "transcript")
    groups <- list(
    up = up_ids,
    down = down_ids,
    background = bg_sample
    )
    
    for (seq_type in seq_types) {
    cat(sprintf("\n⏳ Fetching %s sequences...\n", seq_type))
    for (group_name in names(groups)) {
        cat(sprintf("  🔸 Group: %s\n", group_name))
        ids <- groups[[group_name]]
        seqs <- get_sequences(ids, seq_type)
        if (nrow(seqs) > 0) {
        out_file <- paste0(seq_type, "_", group_name, ".fasta")
        write_fasta(seqs[[seq_type]], seqs$ensembl_gene_id, out_file)
        cat("    ✅ Written to", out_file, "\n")
        } else {
        message("⚠️ No sequences found for ", group_name, " (", seq_type, ")")
        }
    }
    }
    
    cat("\n✅ All FASTA files created for: 3′UTR, 5′UTR, CDS, and Transcript.\n")
    
  2. Using https://rbpmap.technion.ac.il/1747409685/results.html, how to predict p-value for a specific RBP?

    #Try using RBPmap
    ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/motif_analysis/run2/3utr_down.fasta
    
  3. Significant RBP enrichments

    install.packages("BiocManager")
    BiocManager::install("BSgenome")
    BiocManager::install("Biostrings")
    BiocManager::install("motifmatchr")
    #BiocManager::install("MotifDb")
    #BiocManager::install("motifStack")
    BiocManager::install("TFBSTools")
    #install.packages("pheatmap")
    
    library(Matrix)
    library(pheatmap)
    library(Biostrings)
    library(motifmatchr)
    library(GenomicRanges)
    #library(MotifDb)
    #library(motifStack)
    library(TFBSTools)
    library(SummarizedExperiment)
    library(ggplot2)
    library(pheatmap)
    
    library(PWMEnrich)
    library(PWMEnrich.Hsapiens.background)
    
    # ------------- Methods -----------
    
    clean_fasta_file <- function(infile, outfile) {
    lines <- readLines(infile)
    
    # Initialize variables to store cleaned lines
    cleaned_lines <- list()
    add_line <- FALSE  # To track when we need to add a sequence
    
    for (line in lines) {
        # If the line is a header and contains "Sequence unavailable", skip it
        if (startsWith(line, ">") && grepl("Sequence unavailable", line)) {
        add_line <- FALSE  # Don't add this header or its sequence
        } else if (startsWith(line, ">")) {
        # If it's a new header, start adding the following sequence
        add_line <- TRUE
        cleaned_lines <- c(cleaned_lines, line)
        } else if (add_line) {
        # Add the sequence line if it follows a valid header
        cleaned_lines <- c(cleaned_lines, line)
        }
    }
    
    # Write cleaned lines to the output file
    writeLines(cleaned_lines, outfile)
    }
    
    # Function to clean sequences by replacing invalid characters with 'N'
    clean_invalid_sequences <- function(dna_string_set) {
    valid_bases <- c("A", "T", "C", "G", "N")  # Define valid DNA bases
    clean_sequences <- lapply(dna_string_set, function(seq) {
        # Remove invalid characters, replace them with 'N'
        seq <- gsub("[^ATCGN]", "N", as.character(seq))  # Replace non-ATCGN characters with 'N'
        DNAString(seq)  # Return as a valid DNAString object
    })
    return(DNAStringSet(clean_sequences))  # Return cleaned sequences as a DNAStringSet
    }
    
    # ============================================
    # 2. Read and clean 3′ UTR sequences
    # ============================================
    # Clean your FASTA file
    clean_fasta_file("run2/3utr_up.fasta", "run2/3utr_up_cleaned.fasta")
    clean_fasta_file("run2/3utr_down.fasta", "run2/3utr_down_cleaned.fasta")
    clean_fasta_file("run2/3utr_background.fasta", "run2/3utr_background_cleaned.fasta")
    clean_fasta_file("run2/5utr_up.fasta", "run2/5utr_up_cleaned.fasta")
    clean_fasta_file("run2/5utr_down.fasta", "run2/5utr_down_cleaned.fasta")
    clean_fasta_file("run2/5utr_background.fasta", "run2/5utr_background_cleaned.fasta")
    
    # Read in the cleaned FASTA files
    utr_3_up_raw <- readDNAStringSet("run2/3utr_up_cleaned.fasta")
    utr_3_down_raw <- readDNAStringSet("run2/3utr_down_cleaned.fasta")
    utr_3_background_raw <- readDNAStringSet("run2/3utr_background_cleaned.fasta")
    utr_5_up_raw <- readDNAStringSet("run2/5utr_up_cleaned.fasta")
    utr_5_down_raw <- readDNAStringSet("run2/5utr_down_cleaned.fasta")
    utr_5_background_raw <- readDNAStringSet("run2/5utr_background_cleaned.fasta")
    
    # Clean sequences
    utr_3_up <- clean_invalid_sequences(utr_3_up_raw)
    utr_3_down <- clean_invalid_sequences(utr_3_down_raw)
    utr_3_background <- clean_invalid_sequences(utr_3_background_raw)
    utr_5_up <- clean_invalid_sequences(utr_5_up_raw)
    utr_5_down <- clean_invalid_sequences(utr_5_down_raw)
    utr_5_background <- clean_invalid_sequences(utr_5_background_raw)
    
    # Filter out sequences shorter than the shortest PWM (e.g. 6 bp)
    # and any sequences containing ambiguous bases (non‑ACGT)
    min_pwm_length <- 6
    utr_3_down_clean <- utr_3_down[
    width(utr_3_down) >= min_pwm_length &
    !grepl("[^ACGT]", as.character(utr_3_down))
    ]
    #cat("Kept", length(utr_down_clean), "sequences after cleaning.\n")
    utr_3_background_clean <- utr_3_background[
    width(utr_3_background) >= min_pwm_length &
    !grepl("[^ACGT]", as.character(utr_3_background))
    ]
    
    # Install packages (if not already)
    if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
    BiocManager::install(c("PWMEnrich", "Biostrings", "PWMEnrich.Hsapiens.background", "MotifDb"))
    BiocManager::install("PWMEnrich.Hsapiens.background")
    
    # Load libraries
    library(PWMEnrich)
    library(Biostrings)
    library(PWMEnrich.Hsapiens.background)
    library(MotifDb)
    
    # ============================================
    # 3. Load the hg19 GC‑aware background model
    # ============================================
    data("PWMLogn.hg19.MotifDb.Hsap",
        package = "PWMEnrich.Hsapiens.background")
    bg_model <- PWMLogn.hg19.MotifDb.Hsap
    
    # ============================================
    # 4. Define your list of RBP gene symbols
    # ============================================
    
    # Specific RBP gene symbols
    rbp_genes <- c("A1CF", "A2BP1", "AC004381.6", "AC008073.5", "ACO1", "AKAP1", "AKAP17A", "AL662825.2", "AL662825.3", "AL662849.4", "AL844853.7", "ALKBH8", "ANKHD1", "ANKRD17", "APLF", "ASCC1", "BICC1", "BOLL", "BRAP", "BRUNOL4", "BRUNOL5", "BRUNOL6", "BX000357.2", "BX005143.2", "BX119957.7", "BX248507.1", "BX248518.2", "BX511012.1", "BX908728.4", "BX927220.1", "C14orf156", "C14orf21", "CARHSP1", "CELF1", "CELF2", "CELF3", "CIRBP", "CNBP", "CNOT4", "CPEB1", "CPEB2", "CPEB3", "CPEB4", "CPSF4", "CPSF4L", "CPSF6", "CPSF7", "CR388219.1", "CR388372.1", "CR753328.1", "CR759778.5", "CR759782.5", "CR847863.1", "CSDA", "CSDC2", "CSDE1", "CSTF2", "CSTF2T", "DAZ1", "DAZ2", "DAZ3", "DAZ4", "DAZAP1", "DAZL", "DDX41", "DDX43", "DDX53", "DHX57", "DHX8", "DNAJC17", "DPPA5", "DUS3L", "EIF2S1", "EIF3B", "EIF3G", "EIF4B", "EIF4H", "ELAVL1", "ELAVL2", "ELAVL3", "ELAVL4", "ENOX1", "ENOX2", "ENSG00000180771", "ENSG00000183403", "ENSG00000185728", "ENSG00000213250", "ENSG00000215042", "ENSG00000215492", "ENSG00000231942", "ENSG00000248163", "ENSG00000249536", "ENSG00000249644", "ENSG00000250177", "ERAL1", "ESRP1", "ESRP2", "EWSR1", "FMR1", "FUBP1", "FUBP3", "FUS", "Fusip1", "FXR1", "FXR2", "G3BP1", "G3BP2", "GRSF1", "HDLBP", "HELZ", "HNRNPA0", "HNRNPA1", "HNRNPA1L2", "HNRNPA2B1", "HNRNPA3", "HNRNPAB", "HNRNPC", "HNRNPCL1", "HNRNPD", "HNRNPF", "HNRNPH1", "HNRNPH2", "HNRNPH3", "hnRNPK", "HNRNPL", "hnRNPLL", "HNRNPM", "HNRNPR", "HNRPDL", "HTATSF1", "IGF2BP1", "IGF2BP2", "IGF2BP3", "KHDRBS1", "KHDRBS2", "KHDRBS3", "KHSRP", "KIAA0430", "KRR1", "LARP1", "LARP1B", "LARP4", "LARP4B", "LARP6", "LARP7", "LENG9", "LIN28A", "LIN28B", "MATR3", "MBNL1", "MBNL2", "MBNL3", "MDM2", "MEX3B", "MEX3C", "MEX3D", "MKI67IP", "MKRN1", "MKRN2", "MKRN3", "MRPS28", "MSI1", "MSI2", "MTHFSD", "MYEF2", "NCBP2", "NCBP2L", "NCL", "NEIL3", "NOL8", "NONO", "NOVA1", "NOVA2", "NPLOC4", "NUP153", "NUPL2", "PABPC1", "PABPC1L", "PABPC1L2A", "PABPC1L2B", "PABPC3", "PABPC4", "PABPC5", "PABPN1", "PABPN1L", "PAN3", "PARP12", "PCBP1", "PCBP2", "PCBP3", "PCBP4", "PDCD11", "PEG10", "PNMA3", "PNO1", "PNPT1", "POLDIP3", "POLR2G", "PPARGC1A", "PPARGC1B", "PPIE", "PPIL4", "PPP1R10", "PPRC1", "PRR3", "PSPC1", "PTBP1", "PTBP2", "PUF60", "PUM1", "PUM2", "QKI", "RALY", "RANBP2", "RAVER1", "RAVER2", "RBBP6", "RBCK1", "RBFOX2", "RBFOX3", "RBM10", "RBM11", "RBM12", "RBM12B", "RBM14-RBM4", "RBM15", "RBM15B", "RBM17", "RBM18", "RBM19", "RBM22", "RBM23", "RBM24", "RBM25", "RBM26", "RBM27", "RBM28", "RBM3", "RBM33", "RBM34", "RBM38", "RBM39", "RBM4", "RBM41", "RBM42", "RBM44", "RBM45", "RBM46", "RBM47", "RBM4B", "RBM5", "RBM6", "RBM7", "RBM8A", "RBMS1", "RBMS2", "RBMS3", "RBMX", "RBMX2", "RBMXL1", "RBMXL2", "RBMXL3", "RBMY1A1", "RBMY1B", "RBMY1D", "RBMY1E", "RBMY1F", "RBMY1J", "RBPMS", "RBPMS2", "RBP_Name", "RC3H1", "RC3H2", "RCAN2", "RDBP", "RDM1", "RNF113A", "RNF113B", "RNF31", "RNPC3", "RNPS1", "ROD1", "RP11-1286E23.4", "RRP7A", "RYBP", "SAFB", "SAFB2", "SAMD4A", "SAMD4B", "SART3", "SCAF4", "SCAF8", "SETD1A", "SETD1B", "SF1", "SF3B4", "SFPQ", "SHARPIN", "SLTM", "SNRNP35", "SNRNP70", "SNRPA", "SNRPB2", "SOLH", "SPEN", "SRBD1", "SREK1", "SRRT", "SRSF1", "SRSF11", "SRSF12", "SRSF2", "SRSF3", "SRSF4", "SRSF5", "SRSF6", "SRSF7", "SRSF9", "SSB", "STAR-PAP", "SUPT6H", "SYNCRIP", "TAB2", "TAB3", "TAF15", "TARDBP", "TDRD10", "TDRKH", "TEX13A", "THOC4", "TIA1", "TIAL1", "TMEM63A", "TMEM63B", "TNRC6A", "TNRC6B", "TNRC6C", "TOE1", "TRA2A", "TRA2B", "TRMT1", "TRMT2A", "TRNAU1AP", "TTC14", "U2AF1", "U2AF1L4", "U2AF2", "U2SURP", "UHMK1", "UNK", "UNKL", "UPF3B", "YAF2", "YB-1", "YBX2", "YTHDC1", "YTHDC2", "YTHDF1", "YTHDF2", "ZC3H10", "ZC3H13", "ZC3H15", "ZC3H18", "ZC3H3", "ZC3H4", "ZC3H6", "ZC3H7A", "ZC3H7B", "ZC3H8", "ZCCHC11", "ZCCHC13", "ZCCHC14", "ZCCHC17", "ZCCHC2", "ZCCHC3", "ZCCHC5", "ZCCHC6", "ZCCHC7", "ZCCHC8", "ZCCHC9", "ZCRB1", "ZFP36", "ZFP36L1", "ZFP36L2", "ZGPAT", "ZMAT5", "ZNF638", "ZRANB1", "ZRANB2", "ZRANB3", "ZRSR1", "ZRSR2")
    
    # ============================================
    # 5. Extract all PWMs from background & filter to RBPs
    # ============================================
    #all_pwms <- bg_model@pwms  #2287
    #rbp_pwms <- all_pwms[
    #  vapply(names(all_pwms),
    #         function(nm) any(nm %in% rbp_genes),
    #         logical(1))
    #]
    
    # 如果你想做模糊匹配(基因名作为子串出现),可以用:
    rbp_pwms <- all_pwms[grepl(paste(rbp_genes, collapse="|"), names(all_pwms), ignore.case=TRUE)]
    
    cat("Number of RBP PWMs selected:", length(rbp_pwms), "\n")
    cat("Example PWM names:\n", head(names(rbp_pwms), 10), "\n")
    
    # ============================================
    # 6. Create a custom background model with only RBP PWMs
    # ============================================
    custom_bg <- bg_model
    custom_bg@pwms <- rbp_pwms
    
    # ============================================
    # 7. Run motif enrichment
    # ============================================
    enrichment_result <- motifEnrichment(utr_3_down_clean, custom_bg)
    
    # ============================================
    # 8. Generate report & convert to data.frame
    # ============================================
    report_list <- groupReport(enrichment_result)
    report_df   <- as.data.frame(report_list)
    
    # ============================================
    # 9. Apply BH correction and filter significant hits
    # ============================================
    report_df$adj_pval <- p.adjust(report_df$p.value, method = "BH")
    significant_hits <- subset(report_df, adj_pval < 0.05)
    
    # ============================================
    # 10. View results
    # ============================================
    # Display motif name, raw p-value, adjusted p-value, and enrichment score
    colnames(significant_hits)[
    match(c("target","id","raw.score"), colnames(significant_hits))
    ] <- c("Gene", "Motif_ID", "EnrichmentScore")
    
    print(
    head(
        significant_hits[, c("Gene","Motif_ID","EnrichmentScore","p.value","adj_pval")],
        6
    )
    )
    
    p <- ggplot(significant_hits,
                aes(x = reorder(Motif_ID, adj_pval),
                    y = -log10(adj_pval))) +
    geom_col() +
    coord_flip() +
    labs(
        x = "Motif ID",
        y = expression(-log[10]("FDR‑adjusted p‑value")),
        title = "Significant RBP Motif Enrichment"
    ) +
    theme_minimal()
    ggsave(
    filename = "RBP_motif_enrichment.png",  # or any path you like
    plot     = p,
    width    = 8,     # inches
    height   = 6,     # inches
    dpi      = 300    # resolution
    )
    
  4. Perform similar RBP-mapping by myself

    # Load a set of RBP motifs from a database (e.g., RBPDB) RBPDB, ENCODE, JASPAR, or even UCL's RBP motifs.
    
    # Set path to your PWM directory
    pfm_dir <- "/media/jhuang/Elements/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/motif_analysis/PFMDir"
    
    # List all .pfm files
    pfm_files <- list.files(pfm_dir, pattern = "\\.pfm$", full.names = TRUE)
    
    # Create an empty list to store PWMs
    pfm_list <- list()
    
    # Loop through each file
    for (file in pfm_files) {
    # Read table
    pfm <- tryCatch({
        mat <- read.table(file, header = FALSE)
        mat <- as.matrix(mat)
    
        # Optionally add column names (assuming 4 columns for A, C, G, T)
        if (ncol(mat) == 4) {
        colnames(mat) <- c("A", "C", "G", "T")
        }
    
        mat
    }, error = function(e) {
        message("Failed to read: ", file)
        NULL
    })
    
    # Use the filename (without extension) as list name
    name <- tools::file_path_sans_ext(basename(file))
    pfm_list[[name]] <- pfm
    }
    
    # Check one example
    str(pfm_list[[1]])
    #saveRDS(pfm_list, file = "compiled_pfm_list.rds")
    
    #The error you're seeing (invalid PFM 'x': not an integer matrix) indicates that the PWM function expects the input matrix to represent raw position frequency matrices (PFMs), which are expected to be integer counts, but the matrices you have are likely log-odds matrices (floating point values), which represent the log-transformed scores instead of raw counts.
    
    # convert to PFMatrix
    pfm_named_list <- lapply(pfm_list, function(mat) {
    mat <- as.matrix(mat)
    if (!all(rownames(mat) %in% c("A", "C", "G", "T"))) {
        rownames(mat) <- c("A", "C", "G", "T")
    }
    PFMatrix(ID = "motif", name = "motif", profileMatrix = mat)
    })
    
    # Create an empty list to store results
    match_results_list <- list()
    
    # Iterate through each PFMatrix object in the list
    for (i in seq_along(pfm_named_list)) {
    # Get the current PFMatrix object
    current_pfm <- pfm_named_list[[i]]
    
    # Run matchMotifs for the current motif
    match_result <- tryCatch({
        #matchMotifs(pfm_obj, subject = utr_3_down, genome = NULL, p.cutoff = 1e-3)
        matchMotifs(current_pfm, subject = utr_3_down, genome = NULL, p.cutoff = 1e-3)
    }, error = function(e) {
        message("Error with motif ", names(pfm_named_list)[i], ": ", e$message)
        return(NULL)  # Return NULL if error occurs
    })
    
    # Store the result if no error
    if (!is.null(match_result)) {
        match_results_list[[names(pfm_named_list)[i]]] <- match_result
    }
    }
    
    # Check the results
    match_results_list
    
    # Extract motif match results for one motif
    motif_matches_1004_8676391 <- assay(match_results_list[["1004_8676391"]], "motifMatches")
    
    # Convert sparse matrix to dense matrix
    dense_matrix <- as.matrix(motif_matches_1004_8676391)
    
    # Optional: Add rownames for better visualization (if available)
    if (!is.null(names(utr_3_down))) {
    rownames(dense_matrix) <- names(utr_3_down)
    }
    
    # Convert logical matrix to numeric (1 for TRUE, 0 for FALSE)
    dense_matrix_num <- matrix(as.numeric(dense_matrix),
                            nrow = nrow(dense_matrix),
                            dimnames = dimnames(dense_matrix))
    
    # Now plot the heatmap
    png("motif_match_1004_8676391.png", width = 800, height = 1600)
    pheatmap(dense_matrix_num,
            cluster_rows = FALSE,
            cluster_cols = FALSE,
            color = c("grey", "steelblue"),
            legend_breaks = c(0, 1),
            legend_labels = c("No match", "Match"),
            main = "Motif Match: 1004_8676391")
    dev.off()
    
    #✅ Output
    #You will see a binary heatmap, where:
    #   * Blue cells = motif matched
    #   * White cells = no match
    #Let me know if you want to loop this over all motifs or save the plot to a file.
    
  5. Does hg19 vs hg38 Position Matter in Motif Enrichment?

    ✅ If you're doing motif enrichment analysis only (like PWMEnrich):
    
    Position DOES NOT matter.
    Here's why:
    Aspect  Explanation
    🔍 PWMEnrich Only scans sequences for motif matches based on nucleotide content (e.g., enrichment of AU-rich motifs in downregulated 3′UTRs).
    📏 Coordinates   Genomic coordinates (hg19 vs hg38) are not used in enrichment. Only the sequence itself matters.
    ⚖️ Statistical Test The enrichment is based on observed motif frequency vs background expectations (GC-aware or custom). No genome alignment is done.
    
    So:
    
        ✅ As long as your 3′ UTR sequences are correct and biologically relevant, using an hg19 background is safe — even if your gene models are from hg38.
    
    ❗️ When does genome version matter?
    
    If you're doing things like:
    Use case    Position-sensitive?
    🔬 Variant effect on motifs (e.g., SNPs in motifs)   ✅ Yes
    🧭 Mapping motifs to genome positions (e.g., ChIP-seq overlaps)  ✅ Yes
    📦 Loading sequences from coordinates (not FASTA)    ✅ Yes
    🧠 Expression + genomic track integration (e.g., motif + ATAC-seq)   ✅ Yes
    
    Then you must ensure genome versions match exactly.
    ✅ In Your Case: Motif Enrichment of Downregulated 3′UTRs
    
        If you are:
    
        Feeding real 3′UTR sequences (as DNAStringSet) for downregulated genes
    
        Using PWMEnrich purely for motif enrichment
    
    Then:
    
        ✅ hg19 vs hg38 does not matter
        ❗ Only sequence content matters
    
    Let me know if you want to later map enriched motifs back to genome positions — that’s when we’d switch to position-aware tools like FIMO, MOODS, or rtracklayer.
    
  6. Custom RBP-database preparation (failed!)

    # Example: Create a PWM for 2 fake motifs (replace with your actual PWMs)
    #motif1 <- matrix(c(0.2, 0.3, 0.3, 0.2,
    #                   0.1, 0.4, 0.4, 0.1,
    #                   0.25, 0.25, 0.25, 0.25,
    #                   0.3, 0.2, 0.3, 0.2), nrow = 4, byrow = TRUE,
    #                 dimnames = list(c("A", "C", "G", "T"), NULL))
    #motif2 <- matrix(c(0.3, 0.2, 0.2, 0.3,
    #                   0.4, 0.1, 0.4, 0.1,
    #                   0.25, 0.25, 0.25, 0.25,
    #                   0.2, 0.3, 0.2, 0.3), nrow = 4, byrow = TRUE,
    #                 dimnames = list(c("A", "C", "G", "T"), NULL))
    #rbp_pwms <- list(RBP1 = motif1, RBP2 = motif2)
    
    pwm_dir <- "/media/jhuang/Elements/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/motif_analysis/PWMDir"
    pwm_files <- list.files(pwm_dir, pattern = "\\.pwm$", full.names = TRUE)
    rbp_pwms <- list()
    for (file in pwm_files) {
    # Read table
    pwm <- tryCatch({
        mat <- read.table(file, header = FALSE)
        mat <- as.matrix(mat)
    
        # Optionally add column names (assuming 4 columns for A, C, G, T)
        if (ncol(mat) == 4) {
        colnames(mat) <- c("A", "C", "G", "T")
        }
    
        mat
    }, error = function(e) {
        message("Failed to read: ", file)
        NULL
    })
    
    # Use the filename (without extension) as list name
    name <- tools::file_path_sans_ext(basename(file))
    rbp_pwms[[name]] <- pwm
    }
    
    # -- filtering (ERROR) --
    pwm_list <- motifs(bg_model)
    # Example: filter motifs containing 'ELAVL' or 'HNRNP'
    #rbp_pwms <- pwm_list[grep("ELAVL|HNRNP|IGF2BP|RBFOX|FMR1|PUM", names(pwm_list))]
    rbp_enrichment <- motifEnrichment(utr_3_down, bg_model, pwmList = rbp_pwms)
    rbp_report <- groupReport(rbp_enrichment)
    
    # -- from MotifDb --
    library(MotifDb)
    human_motifs <- query(MotifDb, "Hsapiens")
    rbp_motifs <- query(human_motifs, "RBP")
    
    # ---- try Build your background model directly from these PWMs ----
    
    # 1. List all PWM files
    files_all <- list.files("~/Downloads/cisbp-rna/pwms_all_motifs", pattern="\\.txt$", full.names=TRUE)
    
    # 2. Filter out empty files
    file_sizes <- file.info(files_all)$size
    files     <- files_all[file_sizes > 0]
    cat("Reading", length(files), "non‑empty PWM files out of", length(files_all), "\n")
    
    # 3. Define the reader (as before)
    readCisbpPfm <- function(path) {
    df <- read.table(path, header=TRUE, stringsAsFactors=FALSE)
    mat_rna <- as.matrix(df[, c("A","C","G","U")])
    mat_dna <- t(mat_rna)
    rownames(mat_dna) <- c("A","C","G","T")
    sweep(mat_dna, 2, colSums(mat_dna), FUN="/")
    }
    
    # 4. Read only the non‐empty files
    cisbp_mats <- lapply(files, readCisbpPfm)
    names(cisbp_mats) <- tools::file_path_sans_ext(basename(files))
    
    # 5. (Optional) Check the first few
    head(names(cisbp_mats))
    length(cisbp_mats)  #193
    
    # 6. Build your background model directly from these PWMs:
    bg_utrs <- utr_3_background_clean    #readDNAStringSet("run2/3utr_background.fasta")
    #bg_utrs <- bg_utrs[ width(bg_utrs)>=6 & !grepl("[^ACGT]", as.character(bg_utrs)) ]
    
    custom_bg <- makeBackground(
    seqs    = bg_utrs,
    motifs = cisbp_mats,
    type    = "logn",
    organism = "hg19",
    verbose  = TRUE
    )
    
    custom_bg <- makeBackground(
    sequences = bg_utrs,       # your DNAStringSet of background UTRs
    motifs    = cisbp_mats,    # your named list of PWM matrices
    type      = "logn",        # log‑normal GC‑aware model
    organism  = NULL,          # disable built‑in genomes
    verbose   = TRUE
    )
    
    custom_bg <- makeBackground(
    cisbp_mats,        # 1) motifs
    organism = "hg19",
    type    = "logn",  # 2) background model type
    bg.seq  = bg_utrs, # 3) override organism by supplying your own sequences
    quick   = FALSE    # (optional) fit to all sequences for best accuracy
    )
    
    # 1. Pick a library size to re‑scale your PWMs into PFMs
    library_size <- 1000   # you can also try 1000
    
    # 2. Convert each normalized matrix into integer counts
    cisbp_pfms <- lapply(cisbp_mats, function(pmat) {
    counts <- round(pmat * library_size)
    # ensure no column sums to zero (add pseudocount 1 if needed)
    zero_cols <- which(colSums(counts) == 0)
    if (length(zero_cols) > 0) {
        counts[, zero_cols] <- 1
    }
    counts
    })
    
    # 3. Build a custom background (PFMs + your bg UTRs)
    custom_bg <- makeBackground(
    cisbp_pfms,      # 1) a list of integer PFMs
    type   = "logn", # 2) log‑normal GC‑aware model
    bg.seq = bg_utrs,# 3) your background DNAStringSet
    quick  = FALSE   # fit on all sequences
    )
    
    # … then continue with motifEnrichment() on your down‑regulated UTRs as before.
    
    # 5) Read & clean your down‑regulated 3′UTRs
    down_utrs <- readDNAStringSet("downregulated_utrs.fasta")
    down_utrs <- down_utrs[ width(down_utrs)>=6 & !grepl("[^ACGT]", as.character(down_utrs)) ]
    
    # 6) Run the enrichment
    res    <- motifEnrichment(down_utrs, custom_bg)
    report <- groupReport(res)
    df     <- as.data.frame(report)
    df$adj_pval <- p.adjust(df$p.value, method="BH")
    
    # 7) Subset significant
    sig <- subset(df, adj_pval < 0.05)
    print(sig[, c("motif.name","p.value","adj_pval","enrichmentScore")])
    

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum