gene_x 0 like s 16 view s
Tags: variation, pipeline, DNA-seq
Input data:
ln -s ../raw_data_2024/hCoV229E_Rluc_R1.fastq.gz hCoV229E_Rluc_R1.fastq.gz
ln -s ../raw_data_2024/hCoV229E_Rluc_R2.fastq.gz hCoV229E_Rluc_R2.fastq.gz
ln -s ../raw_data_2024/p10_DMSO_R1.fastq.gz p10_DMSO_R1.fastq.gz
ln -s ../raw_data_2024/p10_DMSO_R2.fastq.gz p10_DMSO_R2.fastq.gz
ln -s ../raw_data_2024/p10_K22_R1.fastq.gz p10_K22_R1.fastq.gz
ln -s ../raw_data_2024/p10_K22_R2.fastq.gz p10_K22_R2.fastq.gz
ln -s ../raw_data_2024/p10_K7523_R1.fastq.gz p10_K7523_R1.fastq.gz
ln -s ../raw_data_2024/p10_K7523_R2.fastq.gz p10_K7523_R2.fastq.gz
ln -s ../raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20606/p16_DMSO_S29_R1_001.fastq.gz p16_DMSO_R1.fastq.gz
ln -s ../raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20606/p16_DMSO_S29_R2_001.fastq.gz p16_DMSO_R2.fastq.gz
ln -s ../raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20607/p16_K22_S30_R1_001.fastq.gz p16_K22_R1.fastq.gz
ln -s ../raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20607/p16_K22_S30_R2_001.fastq.gz p16_K22_R2.fastq.gz
ln -s ../raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20608/p16_X7523_S31_R1_001.fastq.gz p16_X7523_R1.fastq.gz
ln -s ../raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20608/p16_X7523_S31_R2_001.fastq.gz p16_X7523_R2.fastq.gz
Call variant calling using snippy
ln -s ~/Tools/bacto/db/ .;
ln -s ~/Tools/bacto/envs/ .;
ln -s ~/Tools/bacto/local/ .;
cp ~/Tools/bacto/Snakefile .;
cp ~/Tools/bacto/bacto-0.1.json .;
cp ~/Tools/bacto/cluster.json .;
#download CU459141.gb from GenBank
mv ~/Downloads/sequence\(2\).gb db/PP810610.gb
#setting the following in bacto-0.1.json
"fastqc": false,
"taxonomic_classifier": false,
"assembly": true,
"typing_ariba": false,
"typing_mlst": true,
"pangenome": true,
"variants_calling": true,
"phylogeny_fasttree": true,
"phylogeny_raxml": true,
"recombination": false, (due to gubbins-error set false)
"genus": "Alphacoronavirus",
"kingdom": "Viruses",
"species": "Human coronavirus 229E",
"mykrobe": {
"species": "corona"
},
"reference": "db/PP810610.gb"
mamba activate /home/jhuang/miniconda3/envs/bengal3_ac3
(bengal3_ac3) /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
Summarize all SNPs and Indels from the snippy result directory.
#Output: snippy/summary_snps_indels.csv
# IMPORTANT_ADAPT the array isolates = ["AYE-S", "AYE-Q", "AYE-WT on Tig4", "AYE-craA on Tig4", "AYE-craA-1 on Cm200", "AYE-craA-2 on Cm200"]
python3 ~/Scripts/summarize_snippy_res.py snippy
cd snippy
#grep -v "None,,,,,,None,None" summary_snps_indels.csv > summary_snps_indels_.csv
Using spandx calling variants (almost the same results to the one from viral-ngs!)
mamba activate /home/jhuang/miniconda3/envs/spandx
mkdir ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/PP810610
cp PP810610.gb ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/PP810610/genes.gbk
vim ~/miniconda3/envs/spandx/share/snpeff-5.1-2/snpEff.config
/home/jhuang/miniconda3/envs/spandx/bin/snpEff build PP810610 #-d
~/Scripts/genbank2fasta.py PP810610.gb
mv PP810610.gb_converted.fna PP810610.fasta #rename "NC_001348.1 xxxxx" to "NC_001348" in the fasta-file
ln -s /home/jhuang/Tools/spandx/ spandx
(spandx) nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq" --ref PP810610.fasta --annotation --database PP810610 -resume
# Rerun SNP_matrix.sh due to the error ERROR_CHROMOSOME_NOT_FOUND in the variants annotation
cd Outputs/Master_vcf
(spandx) cp -r ../../snippy/hCoV229E_Rluc/reference .
(spandx) cp ../../spandx/bin/SNP_matrix.sh ./
#Note that ${variant_genome_path}=NC_001348 in the following command, but it was not used after command replacement.
#Adapt "snpEff eff -no-downstream -no-intergenic -ud 100 -formatEff -v ${variant_genome_path} out.vcf > out.annotated.vcf" to
"/home/jhuang/miniconda3/envs/bengal3_ac3/bin/snpEff eff -no-downstream -no-intergenic -ud 100 -formatEff -c reference/snpeff.config -dataDir . ref out.vcf > out.annotated.vcf" in SNP_matrix.sh
(spandx) bash SNP_matrix.sh PP810610 .
Calling inter-host variants by merging the results from snippy+spandx (Manually!)
# Inter-host variants(宿主间变异):一种病毒在两个人之间有不同的基因变异,这些变异可能与宿主的免疫反应、疾病表现或病毒传播的方式相关。
cp All_SNPs_indels_annotated.txt All_SNPs_indels_annotated_backup.txt
vim All_SNPs_indels_annotated.txt
#in the file ids: grep "$(echo -e '\t')353$(echo -e '\t')" All_SNPs_indels_annotated.txt >> All_SNPs_indels_annotated_.txt
#Replace \n with " All_SNPs_indels_annotated.txt >> All_SNPs_indels_annotated_.txt\ngrep "
#Replace grep " --> grep "$(echo -e '\t')
#Replace " All_ --> $(echo -e '\t')" All_
# Potential intra-host variants: 10871, 19289, 23435.
CHROM POS REF ALT TYPE hCoV229E_Rluc_trimmed p10_DMSO_trimmed p10_K22_trimmed p10_K7523_trimmed p16_DMSO_trimmed p16_K22_trimmed p16_X7523_trimmed Effect Impact Functional_Class Codon_change Protein_and_nucleotide_change Amino_Acid_Length Gene_name Biotype
PP810610 1464 T C SNP C C C C C C C missense_variant MODERATE MISSENSE gTt/gCt p.Val416Ala/c.1247T>C 6757 CDS_1 protein_coding
PP810610 1699 C T SNP T T T T T T T synonymous_variant LOW SILENT gtC/gtT p.Val494Val/c.1482C>T 6757 CDS_1 protein_coding
PP810610 6691 C T SNP T T T T T T T synonymous_variant LOW SILENT tgC/tgT p.Cys2158Cys/c.6474C>T 6757 CDS_1 protein_coding
PP810610 6919 C G SNP G G G G G G G synonymous_variant LOW SILENT ggC/ggG p.Gly2234Gly/c.6702C>G 6757 CDS_1 protein_coding
PP810610 7294 T A SNP A A A A A A A missense_variant MODERATE MISSENSE agT/agA p.Ser2359Arg/c.7077T>A 6757 CDS_1 protein_coding
* PP810610 10871 C T SNP C C/T T C/T C/T T C/T missense_variant MODERATE MISSENSE Ctt/Ttt p.Leu3552Phe/c.10654C>T 6757 CDS_1 protein_coding
PP810610 14472 T C SNP C C C C C C C missense_variant MODERATE MISSENSE aTg/aCg p.Met4752Thr/c.14255T>C 6757 CDS_1 protein_coding
PP810610 15458 T C SNP C C C C C C C synonymous_variant LOW SILENT Ttg/Ctg p.Leu5081Leu/c.15241T>C 6757 CDS_1 protein_coding
PP810610 16035 C A SNP A A A A A A A stop_gained HIGH NONSENSE tCa/tAa p.Ser5273*/c.15818C>A 6757 CDS_1 protein_coding
PP810610 17430 T C SNP C C C C C C C missense_variant MODERATE MISSENSE tTa/tCa p.Leu5738Ser/c.17213T>C 6757 CDS_1 protein_coding
* PP810610 19289 G T SNP G G T G G G/T G missense_variant MODERATE MISSENSE Gtt/Ttt p.Val6358Phe/c.19072G>T 6757 CDS_1 protein_coding
PP810610 21183 T G SNP G G G G G G G missense_variant MODERATE MISSENSE tTt/tGt p.Phe230Cys/c.689T>G 1173 CDS_2 protein_coding
PP810610 22636 T G SNP G G G G G G G missense_variant MODERATE MISSENSE aaT/aaG p.Asn714Lys/c.2142T>G 1173 CDS_2 protein_coding
PP810610 23022 T C SNP C C C C C C C missense_variant MODERATE MISSENSE tTa/tCa p.Leu843Ser/c.2528T>C 1173 CDS_2 protein_coding
* PP810610 23435 C T SNP C C T C/T C C/T C/T missense_variant MODERATE MISSENSE Ctt/Ttt p.Leu981Phe/c.2941C>T 1173 CDS_2 protein_coding
PP810610 24512 C T SNP T T T T T T T missense_variant MODERATE MISSENSE Ctc/Ttc p.Leu36Phe/c.106C>T 88 CDS_4 protein_coding
PP810610 24781 C T SNP T T T T T T T missense_variant MODERATE MISSENSE aCt/aTt p.Thr36Ile/c.107C>T 77 CDS_5 protein_coding
PP810610 25163 C T SNP T T T T T T T missense_variant MODERATE MISSENSE Ctt/Ttt p.Leu82Phe/c.244C>T 225 CDS_6 protein_coding
PP810610 25264 C T SNP T T T T T T T synonymous_variant LOW SILENT gtC/gtT p.Val115Val/c.345C>T 225 CDS_6 protein_coding
PP810610 26838 G T SNP T T T T T T T
Calling intra-host variants using viral-ngs
# Intra-host variants(宿主内变异):同一个人感染了某种病毒,但在其体内的不同细胞或器官中可能存在多个不同的病毒变异株。
#How to run and debug the viral-ngs docker?
# ---- DEBUG_2025_1: using docker instead ----
mkdir viralngs; cd viralngs
ln -s ~/Tools/viral-ngs_docker/Snakefile Snakefile
ln -s ~/Tools/viral-ngs_docker/bin bin
cp ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/refsel.acids refsel.acids
cp ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/lastal.acids lastal.acids
cp ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/config.yaml config.yaml
cp ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/samples-runs.txt samples-runs.txt
cp ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/samples-depletion.txt samples-depletion.txt
cp ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/samples-metagenomics.txt samples-metagenomics.txt
cp ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/samples-assembly.txt samples-assembly.txt
cp ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/samples-assembly-failures.txt samples-assembly-failures.txt
# Adapt the sample-*.txt
mkdir viralngs/data
mkdir viralngs/data/00_raw
mkdir bams
ref_fa="PP810610.fasta";
#for sample in hCoV229E_Rluc p10_DMSO p10_K22; do
for sample in p10_K7523 p16_DMSO p16_K22 p16_X7523; do
bwa index ${ref_fa}; \
bwa mem -M -t 16 ${ref_fa} trimmed/${sample}_trimmed_P_1.fastq trimmed/${sample}_trimmed_P_2.fastq | samtools view -bS - > bams/${sample}_genome_alignment.bam; \
done
conda activate viral-ngs4
#for sample in hCoV229E_Rluc p10_DMSO p10_K22; do
#for sample in p10_K7523 p16_DMSO p16_K22 p16_X7523; do
for sample in p16_K22; do
picard AddOrReplaceReadGroups I=bams/${sample}_genome_alignment.bam O=~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2025/viralngs/data/00_raw/${sample}.bam SORT_ORDER=coordinate CREATE_INDEX=true RGPL=illumina RGID=$sample RGSM=$sample RGLB=standard RGPU=$sample VALIDATION_STRINGENCY=LENIENT; \
done
conda deactivate
# -- ! Firstly set the samples-assembly.txt empty, so that only focus on running depletion!
docker run -it -v /mnt/md1/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2025/viralngs:/work -v /home/jhuang/Tools/viral-ngs_docker:/home/jhuang/Tools/viral-ngs_docker -v /home/jhuang/REFs:/home/jhuang/REFs -v /home/jhuang/Tools/GenomeAnalysisTK-3.6:/home/jhuang/Tools/GenomeAnalysisTK-3.6 -v /home/jhuang/Tools/novocraft_v3:/home/jhuang/Tools/novocraft_v3 -v /usr/local/bin/gatk:/usr/local/bin/gatk own_viral_ngs bash
cd /work
snakemake --directory /work --printshellcmds --cores 40
# -- ! Secondly manully run assembly steps
# --> By itereative add the unfinished assembly in the list, each time replace one, and run "snakemake --directory /work --printshellcmds --cores 40"
# # ---- NOTE that the following steps need rerun --> DOES NOT WORK, USE STRATEGY ABOVE ----
# #for sample in p10_K22 p10_K7523; do
# for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523 p16_DMSO p16_K22 p16_X7523; do
# bin/read_utils.py merge_bams data/01_cleaned/${sample}.cleaned.bam tmp/01_cleaned/${sample}.cleaned.bam --picardOptions SORT_ORDER=queryname
# bin/read_utils.py rmdup_mvicuna_bam tmp/01_cleaned/${sample}.cleaned.bam data/01_per_sample/${sample}.cleaned.bam --JVMmemory 30g
# done
#
# #Note that the error generated by nextflow is from the step gapfill_gap2seq!
# for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523 p16_DMSO p16_K22 p16_X7523; do
# bin/assembly.py assemble_spades data/01_per_sample/${sample}.taxfilt.bam /home/jhuang/REFs/viral_ngs_dbs/trim_clip/contaminants.fasta tmp/02_assembly/${sample}.assembly1-spades.fasta --nReads 10000000 --threads 15 --memLimitGb 12
# done
# for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523 p16_DMSO p16_K22 p16_X7523; do
# for sample in p10_K22 p10_K7523; do
# bin/assembly.py order_and_orient tmp/02_assembly/${sample}.assembly1-spades.fasta refsel_db/refsel.fasta tmp/02_assembly/${sample}.assembly2-scaffolded.fasta --min_pct_contig_aligned 0.05 --outAlternateContigs tmp/02_assembly/${sample}.assembly2-alternate_sequences.fasta --nGenomeSegments 1 --outReference tmp/02_assembly/${sample}.assembly2-scaffold_ref.fasta --threads 15
# done
#
# for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523 p16_DMSO p16_K22 p16_X7523; do
# bin/assembly.py gapfill_gap2seq tmp/02_assembly/${sample}.assembly2-scaffolded.fasta data/01_per_sample/${sample}.cleaned.bam tmp/02_assembly/${sample}.assembly2-gapfilled.fasta --memLimitGb 12 --maskErrors --randomSeed 0 --loglevel DEBUG
# done
#IMPORTANT: Reun the following commands!
for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523 p16_DMSO p16_K22 p16_X7523; do
bin/assembly.py impute_from_reference tmp/02_assembly/${sample}.assembly2-gapfilled.fasta tmp/02_assembly/${sample}.assembly2-scaffold_ref.fasta tmp/02_assembly/${sample}.assembly3-modify.fasta --newName ${sample} --replaceLength 55 --minLengthFraction 0.05 --minUnambig 0.05 --index --loglevel DEBUG
done
# for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523 p16_DMSO p16_K22 p16_X7523; do
# bin/assembly.py refine_assembly tmp/02_assembly/${sample}.assembly3-modify.fasta data/01_per_sample/${sample}.cleaned.bam tmp/02_assembly/${sample}.assembly4-refined.fasta --outVcf tmp/02_assembly/${sample}.assembly3.vcf.gz --min_coverage 2 --novo_params '-r Random -l 20 -g 40 -x 20 -t 502' --threads 15 --loglevel DEBUG
# bin/assembly.py refine_assembly tmp/02_assembly/${sample}.assembly4-refined.fasta data/01_per_sample/${sample}.cleaned.bam data/02_assembly/${sample}.fasta --outVcf tmp/02_assembly/${sample}.assembly4.vcf.gz --min_coverage 3 --novo_params '-r Random -l 20 -g 40 -x 20 -t 100' --threads 15 --loglevel DEBUG
# done
# -- ! Thirdly set the samples-assembly.txt completely and run "snakemake --directory /work --printshellcmds --cores 40"
# ---------------------------- BUG list of the docker pipeline, mostly are due to the version incompability ----------------------------
#BUG_1: FileNotFoundError: [Errno 2] No such file or directory: '/home/jhuang/Tools/samtools-1.9/samtools': '/home/jhuang/Tools/samtools-1.9/samtools'
#DEBUG_1 (DEPRECATED):
# - In docker install independent samtools
conda create -n samtools-1.9-env samtools=1.9 -c bioconda -c conda-forge
# - persistence the modified docker, next time run own docker image
docker ps
#CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
#881a1ad6a990 quay.io/broadinstitute/viral-ngs "bash" 8 minutes ago Up 8 minutes intelligent_yalow
docker commit 881a1ad6a990 own_viral_ngs
docker image ls
docker run -it own_viral_ngs bash
#Change the path as "/opt/miniconda/envs/samtools-1.9-env/bin/samtools" in /work/bin/tools/samtools.py
# If another tool expect for samtools could not be installed, also use the same method above to install it on own_viral_ngs!
#DEBUG_1_BETTER_SIMPLE: TOOL_VERSION = '1.6' --> '1.9' in ~/Tools/viral-ngs_docker/bin/tools/samtools.py
#BUG_2:
bin/taxon_filter.py deplete data/00_raw/2040_04.bam tmp/01_cleaned/2040_04.raw.bam tmp/01_cleaned/2040_04.bmtagger_depleted.bam tmp/01_cleaned/2040_04.rmdup.bam data/01_cleaned/2040_04.cleaned.bam --bmtaggerDbs /home/jhuang/REFs/viral_ngs_dbs/bmtagger_dbs_remove/hg19 /home/jhuang/REFs/viral_ngs_dbs/bmtagger_dbs_remove/metagenomics_contaminants_v3 /home/jhuang/REFs/viral_ngs_dbs/bmtagger_dbs_remove/GRCh37.68_ncRNA-GRCh37.68_transcripts-HS_rRNA_mitRNA --blastDbs /home/jhuang/REFs/viral_ngs_dbs/blast_dbs_remove/hybsel_probe_adapters /home/jhuang/REFs/viral_ngs_dbs/blast_dbs_remove/metag_v3.ncRNA.mRNA.mitRNA.consensus --threads 15 --srprismMemory 14250 --JVMmemory 50g --loglevel DEBUG
#2025-05-23 09:58:45,326 - __init__:445:_attempt_install - DEBUG - Currently installed version of blast: 2.7.1-h4422958_6
#2025-05-23 09:58:45,327 - __init__:448:_attempt_install - DEBUG - Expected version of blast: 2.6.0
#2025-05-23 09:58:45,327 - __init__:449:_attempt_install - DEBUG - Incorrect version of blast installed. Removing it...
#DEBUG_2: TOOL_VERSION = "2.6.0" --> "2.7.1" in ~/Tools/viral-ngs_docker/bin/tools/blast.py
#BUG_3:
bin/read_utils.py bwamem_idxstats data/01_cleaned/1762_04.cleaned.bam /home/jhuang/REFs/viral_ngs_dbs/spikeins/ercc_spike-ins.fasta --outStats reports/spike_count/1762_04.spike_count.txt --minScoreToFilter 60 --loglevel DEBUG
#DEBUG_3: TOOL_VERSION = "0.7.15" --> "0.7.17" in ~/Tools/viral-ngs_docker/bin/tools/bwa.py
#BUG_4: FileNotFoundError: [Errno 2] No such file or directory: '/usr/local/bin/trimmomatic': '/usr/local/bin/trimmomatic'
#DEBUG_4: TOOL_VERSION = "0.36" --> "0.38" in ~/Tools/viral-ngs_docker/bin/tools/trimmomatic.py
#BUG_5: FileNotFoundError: [Errno 2] No such file or directory: '/usr/bin/spades.py': '/usr/bin/spades.py'
#DEBUG_5: TOOL_VERSION = "0.36" --> "0.38" in ~/Tools/viral-ngs_docker/bin/tools/trimmomatic.py
# def install_and_get_path(self):
# # the conda version wraps the jar file with a shell script
# return 'trimmomatic'
#BUG_6: bin/assembly.py order_and_orient tmp/02_assembly/2039_04.assembly1-spades.fasta refsel_db/refsel.fasta tmp/02_assembly/2039_04.assembly2-scaffolded.fasta --min_pct_contig_aligned 0.05 --outAlternateContigs tmp/02_assembly/2039_04.assembly2-alternate_sequences.fasta --nGenomeSegments 1 --outReference tmp/02_assembly/2039_04.assembly2-scaffold_ref.fasta --threads 15 --loglevel DEBUG
2025-05-23 17:40:19,526 - __init__:445:_attempt_install - DEBUG - Currently installed version of mummer4: 4.0.0beta2-pl526hf484d3e_4
2025-05-23 17:40:19,527 - __init__:448:_attempt_install - DEBUG - Expected version of mummer4: 4.0.0rc1
2025-05-23 17:40:19,527 - __init__:449:_attempt_install - DEBUG - Incorrect version of mummer4 installed. Removing it..
DEBUG_6: TOOL_VERSION = "4.0.0rc1" --> "4.0.0beta2" in ~/Tools/viral-ngs_docker/bin/tools/mummer.py
#BUG_7: bin/assembly.py order_and_orient tmp/02_assembly/2039_04.assembly1-spades.fasta refsel_db/refsel.fasta tmp/02_assembly/2039_04.assembly2-scaffolded.fasta --min_pct_contig_aligned 0.05 --outAlternateContigs tmp/02_assembly/2039_04.assembly2-alternate_sequences.fasta --nGenomeSegments 1 --outReference tmp/02_assembly/2039_04.assembly2-scaffold_ref.fasta --threads 15 --loglevel DEBUG
File "bin/assembly.py", line 549, in <listcomp>
base_counts = [sum([len(seg.seq.replace("N", "")) for seg in scaffold]) \
AttributeError: 'Seq' object has no attribute 'replace'
DEBUG_7: base_counts = [sum([len(seg.seq.replace("N", "")) for seg in scaffold]) --> base_counts = [sum([len(seg.seq.ungap('N')) for seg in scaffold]) in ~/Tools/viral-ngs_docker/bin/assembly.py
BUG_8:
bin/assembly.py refine_assembly tmp/02_assembly/1243_2.assembly3-modify.fasta data/01_per_sample/1243_2.cleaned.bam tmp/02_assembly/1243_2.assembly4-refined.fasta --outVcf tmp/02_assembly/1243_2.assembly3.vcf.gz --min_coverage 2 --novo_params '-r Random -l 20 -g 40 -x 20 -t 502' --threads 15 --loglevel DEBUG
File "/work/bin/tools/gatk.py", line 75, in execute
FileNotFoundError: [Errno 2] No such file or directory: '/usr/local/bin/gatk': '/usr/local/bin/gatk'
#DEBUG_8: -v /usr/local/bin/gatk:/usr/local/bin/gatk in 'docker run' and change default python in the script via a shebang; TOOL_VERSION = "3.8" --> "3.6" in ~/Tools/viral-ngs_docker/bin/tools/gatk.py
BUG_9:
pyyaml is missing!
#DEBUG_9: NO_ERROR if rerun!
bin/assembly.py impute_from_reference tmp/02_assembly/2039_04.assembly2-gapfilled.fasta tmp/02_assembly/2039_04.assembly2-scaffold_ref.fasta tmp/02_assembly/2039_04.assembly3-modify.fasta --newName 2039_04 --replaceLength 55 --minLengthFraction 0.05 --minUnambig 0.05 --index --loglevel DEBUG
for sample in 2039_04 2040_04; do
for sample in 1762_04 1243_2 875_04; do
bin/assembly.py impute_from_reference tmp/02_assembly/${sample}.assembly2-gapfilled.fasta tmp/02_assembly/${sample}.assembly2-scaffold_ref.fasta tmp/02_assembly/${sample}.assembly3-modify.fasta --newName ${sample} --replaceLength 55 --minLengthFraction 0.05 --minUnambig 0.05 --index --loglevel DEBUG
done
#BUG_10: bin/reports.py consolidate_fastqc reports/fastqc/2039_04/align_to_self reports/fastqc/2040_04/align_to_self reports/fastqc/1762_04/align_to_self reports/fastqc/1243_2/align_to_self reports/fastqc/875_04/align_to_self reports/summary.fastqc.align_to_self.txt
#DEBUG_10: File "bin/intrahost.py", line 527 and line 579 in merge_to_vcf
# #MODIFIED_BACK
samp_to_seqIndex[sampleName] = seq.seq.ungap('-')
#samp_to_seqIndex[sampleName] = seq.seq.replace("-", "")
#BUG_11: bin/interhost.py multichr_mafft ref_genome/reference.fasta data/02_assembly/2039_04.fasta data/02_assembly/2040_04.fasta data/02_assembly/1762_04.fasta data/02_assembly/1243_2.fasta data/02_assembly/875_04.fasta data/03_multialign_to_ref --ep 0.123 --maxiters 1000 --preservecase --localpair --outFilePrefix aligned --sampleNameListFile data/03_multialign_to_ref/sampleNameList.txt --threads 15 --loglevel DEBUG
2025-05-26 15:04:19,014 - cmd:195:main_argparse - INFO - command: bin/interhost.py multichr_mafft inFastas=['ref_genome/reference.fasta', 'data/02_assembly/2039_04.fasta', 'data/02_assembly/2040_04.fasta', 'data/02_assembly/1762_04.fasta', 'data/02_assembly/1243_2.fasta', 'data/02_assembly/875_04.fasta'] localpair=True globalpair=None preservecase=True reorder=None gapOpeningPenalty=1.53 ep=0.123 verbose=False outputAsClustal=None maxiters=1000 outDirectory=data/03_multialign_to_ref outFilePrefix=aligned sampleRelationFile=None sampleNameListFile=data/03_multialign_to_ref/sampleNameList.txt threads=15 loglevel=DEBUG tmp_dir=/tmp tmp_dirKeep=False
2025-05-26 15:04:19,014 - cmd:209:main_argparse - DEBUG - using tempDir: /tmp/tmp-interhost-multichr_mafft-nuws9mhp
2025-05-26 15:04:21,085 - __init__:445:_attempt_install - DEBUG - Currently installed version of mafft: 7.402-0
2025-05-26 15:04:21,085 - __init__:448:_attempt_install - DEBUG - Expected version of mafft: 7.221
2025-05-26 15:04:21,085 - __init__:449:_attempt_install - DEBUG - Incorrect version of mafft installed. Removing it...
#DEBUG_11: TOOL_VERSION = "7.221" --> "7.402" in ~/Tools/viral-ngs_docker/bin/tools/mafft.py
#BUG_12: bin/interhost.py snpEff data/04_intrahost/isnvs.vcf.gz PP810610.1 data/04_intrahost/isnvs.annot.vcf.gz j.huang@uke.de --loglevel DEBUG
2025-06-10 13:14:07,526 - __init__:445:_attempt_install - DEBUG - Currently installed version of snpeff: 4.3.1t-3
2025-06-10 13:14:07,527 - __init__:448:_attempt_install - DEBUG - Expected version of snpeff: 4.1l
#DEBUG_12: -v /usr/local/bin/gatk:/usr/local/bin/gatk in 'docker run' and change default python in the script via a shebang; TOOL_VERSION = "4.1l" --> "4.3.1t" in ~/Tools/viral-ngs_docker/bin/tools/snpeff.py
Comparing intra- and inter-host variants, comparing the variants to the alignments of the assemblies to confirm its correctness.
From the step 5, only 5 inter-host variants were confirmed: they are 10871, 19289, 23435.
PP810610 10871 hCoV229E_Rluc hCoV229E_Rluc C,T 0.0057070386810399495 0.011348936781066188 1.0 missense_variant 10654C>T Leu3552Phe 3552 6758 Gene_217_20492 XBA84229.1
PP810610 10871 p10_DMSO p10_DMSO C,T 0.0577716643741403 0.10886819833916395 1.0 missense_variant 10654C>T Leu3552Phe 3552 6758 Gene_217_20492 XBA84229.1
PP810610 10871 p10_K22 p10_K22 C,T 1.0 0.0 1.0 missense_variant 10654C>T Leu3552Phe 3552 6758 Gene_217_20492 XBA84229.1
PP810610 10871 p10_K7523 p10_K7523 C,T 0.8228321896444167 0.2915587546587828 1.0 missense_variant 10654C>T Leu3552Phe 3552 6758 Gene_217_20492 XBA84229.1
PP810610 10871 p16_DMSO p16_DMSO C,T 0.02927088877062267 0.05682820768240093 1.0 missense_variant 10654C>T Leu3552Phe 3552 6758 Gene_217_20492 XBA84229.1
PP810610 10871 p16_K22 p16_K22 C,T 0.9911209766925638 0.017600372505084394 1.0 missense_variant 10654C>T Leu3552Phe 3552 6758 Gene_217_20492 XBA84229.1
PP810610 10871 p16_X7523 p16_X7523 C,T 0.8776699029126214 0.21473088886794223 1.0 missense_variant 10654C>T Leu3552Phe 3552 6758 Gene_217_20492 XBA84229.1
PP810610 19289 hCoV229E_Rluc hCoV229E_Rluc G,T 0.0 0.0 1.0 missense_variant 19073G>T Gly6358Val 6358 6758 Gene_217_20492 XBA84229.1
PP810610 19289 p10_DMSO p10_DMSO G,T 0.0 0.0 1.0 missense_variant 19073G>T Gly6358Val 6358 6758 Gene_217_20492 XBA84229.1
PP810610 19289 p10_K22 p10_K22 G,T 1.0 0.0 1.0 missense_variant 19073G>T Gly6358Val 6358 6758 Gene_217_20492 XBA84229.1
PP810610 19289 p10_K7523 p10_K7523 G,T 0.0 0.0 1.0 missense_variant 19073G>T Gly6358Val 6358 6758 Gene_217_20492 XBA84229.1
PP810610 19289 p16_DMSO p16_DMSO G,T 0.0 0.0 1.0 missense_variant 19073G>T Gly6358Val 6358 6758 Gene_217_20492 XBA84229.1
PP810610 19289 p16_K22 p16_K22 G,T 0.9884823848238482 0.02276991943361173 1.0 missense_variant 19073G>T Gly6358Val 6358 6758 Gene_217_20492 XBA84229.1
PP810610 19289 p16_X7523 p16_X7523 G,T 0.0 0.0 1.0 missense_variant 19073G>T Gly6358Val 6358 6758 Gene_217_20492 XBA84229.1
PP810610 23435 hCoV229E_Rluc hCoV229E_Rluc C,T 0.0 0.0 1.0 missense_variant 2941C>T Leu981Phe 981 1173 Gene_20494_24015 XBA84230.1
PP810610 23435 p10_DMSO p10_DMSO C,T 0.031912415560214305 0.061788026586653055 1.0 missense_variant 2941C>T Leu981Phe 981 1173 Gene_20494_24015 XBA84230.1
PP810610 23435 p10_K22 p10_K22 C,T 1.0 0.0 1.0 missense_variant 2941C>T Leu981Phe 981 1173 Gene_20494_24015 XBA84230.1
PP810610 23435 p10_K7523 p10_K7523 C,T 0.8352090032154341 0.27526984832663026 1.0 missense_variant 2941C>T Leu981Phe 981 1173 Gene_20494_24015 XBA84230.1
PP810610 23435 p16_DMSO p16_DMSO C,T 0.0 0.0 1.0 missense_variant 2941C>T Leu981Phe 981 1173 Gene_20494_24015 XBA84230.1
PP810610 23435 p16_K22 p16_K22 C,T 0.958498023715415 0.07955912449811753 1.0 missense_variant 2941C>T Leu981Phe 981 1173 Gene_20494_24015 XBA84230.1
PP810610 23435 p16_X7523 p16_X7523 C,T 0.13175164058556285 0.22878629157715102 1.0 missense_variant 2941C>T Leu981Phe 981 1173 Gene_20494_24015 XBA84230.1
Generate variant_annot.xls and coverages.xls
sudo chown -R jhuang:jhuang data
# -- generate isnvs_annot_complete__.txt, isnvs_annot_0.05.txt from ~/DATA/Data_Pietschmann_RSV_Probe3/data/04_intrahost
cp isnvs.annot.txt isnvs.annot_complete.txt
~/Tools/csv2xls-0.4/csv_to_xls.py isnvs.annot_complete.txt -d$'\t' -o isnvs.annot_complete.xls
#delete the columns patient, time, Hw and Hs and the header in the xls and save as txt file.
awk '{printf "%.3f\n", $5}' isnvs.annot_complete.csv > f5
cut -f1-4 isnvs.annot_complete.csv > f1_4
cut -f6- isnvs.annot_complete.csv > f6_
paste f1_4 f5 > f1_5
paste f1_5 f6_ > isnvs_annot_complete_.txt
#correct f5 in header of isnvs_annot_complete_.txt to iSNV_freq
#header: chr pos sample alleles iSNV_freq eff_type eff_codon_dna eff_aa eff_aa_pos eff_prot_len eff_gene eff_protein
~/Tools/csv2xls-0.4/csv_to_xls.py isnvs_annot_complete_.txt -d$'\t' -o variant_annot.xls
#MANUALLY generate variant_annot_0.01.csv variant_annot_0.05.csv
awk ' $5 >= 0.05 ' isnvs_annot_complete_.txt > 0.05.csv
cut -f2 0.05.csv
awk ' $5 >= 0.01 ' isnvs_annot_complete_.txt > 0.01.csv
cut -f2 0.05.csv | uniq > ids_0.05
cut -f2 0.01.csv | uniq > ids_0.01
#Replace '\n' with '\\t" isnvs_annot_complete_.txt >> isnvs_annot_0.05.txt\ngrep -P "PP810610\\t' in ids_0.05 and then deleting the 'pos' line
#Replace '\n' with '\\t" isnvs_annot_complete_.txt >> isnvs_annot_0.01.txt\ngrep -P "PP810610\\t' in ids_0.01 and then deleting the 'pos' line
#Run ids_0.05 and ids_0.01
cp ../../Outputs/Master_vcf/All_SNPs_indels_annotated.txt ../../Outputs/Master_vcf/All_SNPs_indels_annotated.txt hCoV229E_Rluc_variants
# Delete the three records which already reported in intra-host results hCoV229E_Rluc_variants: they are 10871, 19289, 23435.
PP810610 10871 C T SNP C C/T T C/T C/T T C/T missense_variant MODERATE MISSENSE Ctt/Ttt p.Leu3552Phe/c.10654C>T 6757 CDS_1 protein_coding
PP810610 19289 G T SNP G G T G G G/T G missense_variant MODERATE MISSENSE Gtt/Ttt p.Val6358Phe/c.19072G>T 6757 CDS_1 protein_coding
PP810610 23435 C T SNP C C T C/T C C/T C/T missense_variant MODERATE MISSENSE Ctt/Ttt p.Leu981Phe/c.2941C>T 1173 CDS_2 protein_coding
~/Tools/csv2xls-0.4/csv_to_xls.py isnvs_annot_0.05.txt isnvs_annot_0.01.txt hCoV229E_Rluc_variants -d$'\t' -o variant_annot.xls
#Modify sheetname to variant_annot_0.05 and variant_annot_0.01 and add the header in Excel file.
#Note in the complete list, Set 2024 is NOT a subset of Set 2025 because the element 26283 is in set 2024 but missing from set 2025.
# -- calculate the coverage
samtools depth ./data/02_align_to_self/hCoV229E_Rluc.mapped.bam > hCoV229E_Rluc_cov.txt
samtools depth ./data/02_align_to_self/p10_DMSO.mapped.bam > p10_DMSO_cov.txt
samtools depth ./data/02_align_to_self/p10_K22.mapped.bam > p10_K22_cov.txt
samtools depth ./data/02_align_to_self/p10_K7523.mapped.bam > p10_K7523_cov.txt
~/Tools/csv2xls-0.4/csv_to_xls.py hCoV229E_Rluc_cov.txt p10_DMSO_cov.txt p10_K22_cov.txt p10_K7523_cov.txt -d$'\t' -o coverages.xls
(Optional) Consensus sequences of each and of all isolates
cat PP810610.1.fa OZ035258.1.fa MZ712010.1.fa OK662398.1.fa OK625404.1.fa KF293664.1.fa NC_002645.1.fa > all.fa
cp data/02_assembly/*.fasta ./
for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523; do \
mv ${sample}.fasta ${sample}.fa
cat all.fa ${sample}.fa >> all.fa
done
cat RSV_dedup.fa all.fa > RSV_all.fa
mafft --clustalout --adjustdirection RSV_all.fa > RSV_all.aln
snp-sites RSV_all.aln -o RSV_all_.aln
Report
Please find attached the variant analysis results for Thomas. Variant frequencies in the new samples are highlighted in yellow.
Although PP810610 is used as the reference, only differences observed in the samples p10_DMSO, p10_K22, p10_K7523, p16_DMSO, p16_K22, and p16_X7523 compared to hCoV229E_Rluc are reported in the sheets variant_annot_0.05 and variant_annot_0.01 (see variant_annot.xls). Variants already present in hCoV229E_Rluc are excluded from these sheets. In total, 17 mutations were found in hCoV229E_Rluc relative to PP810610, detailed in the sheet “hCoV229E_Rluc_variants” (see variant_annot.xls).
------ Explanation of iSNV_freq in the sheets variant_annot_0.05 and variant_annot_0.01 ------
The iSNV_freq column shows the frequency of the second allele at each position. For example, at position 23435 on chr PP810610:
chr Position Sample Alleles iSNV_freq
PP810610 23435 hCoV229E_Rluc C,T 0
PP810610 23435 p10_DMSO C,T 0.032
PP810610 23435 p10_K22 C,T 0.995
PP810610 23435 p10_K7523 C,T 0.835
PP810610 23435 p16_DMSO C,T 0
PP810610 23435 p16_K22 C,T 0.958
PP810610 23435 p16_X7523 C,T 0.132
The second allele (T) frequencies are:
0 (only C)
0.032 (3.2% T)
0.995 (99.5% T)
0.835 (83.5% T)
0 (only C)
0.958 (95.8% T)
0.132 (13.2% T)
点赞本文的读者
还没有人对此文章表态
没有评论
Data_Tam_DNAseq_2025 NACHREICHEN_2
DNAseq 2025_AYE and DNAseq 2025_adeABadeIJ_adeIJK_CM1_CM2
Structural Variant Calling for Nanopore Sequencing (edited)
Call and merge SNP and Indel results from using two variant calling methods
© 2023 XGenes.com Impressum