Revision 410d9e3d
Added by Chloé QUIGNOT 11 months ago
Snakefile | ||
---|---|---|
samples=["P10415", "P01308"]
|
||
|
||
rule targets:
|
||
input:
|
||
expand("fasta/{sample}.fasta", sample=samples),
|
||
"fusionFasta/allSequences.fasta",
|
||
"mafft/mafft_res.fasta",
|
||
|
||
|
||
rule loadData:
|
||
output:
|
||
"fasta/{sample}.fasta",
|
||
log:
|
||
stdout = "logs/{sample}_wget.stdout",
|
||
stderr = "logs/{sample}_wget.stderr",
|
||
params:
|
||
dirFasta = "fasta",
|
||
threads: 1
|
||
shell:
|
||
"""
|
||
wget --output-file {log.stderr} \
|
||
--directory-prefix {params.dirFasta} \
|
||
https://www.uniprot.org/uniprot/{wildcards.sample}.fasta > {log.stdout}
|
||
"""
|
||
|
||
|
||
rule fusionFasta:
|
||
input:
|
||
expand("fasta/{sample}.fasta", sample=samples),
|
||
output:
|
||
"fusionFasta/allSequences.fasta",
|
||
log:
|
||
"logs/fusionData.stderr",
|
||
threads: 1
|
||
shell:
|
||
"""
|
||
cat {input} > {output} 2> {log}
|
||
"""
|
||
|
||
|
||
rule mafft:
|
||
input:
|
||
rules.fusionFasta.output,
|
||
output:
|
||
"mafft/mafft_res.fasta",
|
||
log:
|
||
"logs/whichMafft.txt",
|
||
threads: 1
|
||
shell:
|
||
"""
|
||
mafft {input} > {output} 2> {log}
|
||
"""
|
demo_advanced/codes/Snakefile | ||
---|---|---|
wildcard_constraints:
|
||
sample="[a-zA-Z0-9_]+", # matches Unicode word characters
|
||
colExtra="[0-9]",
|
||
|
||
|
||
# Report name:
|
||
report: "report/alignPipeline.rst"
|
||
|
||
|
||
onsuccess:
|
||
print("Workflow finished, no error")
|
||
|
||
|
||
onerror:
|
||
print("An error occurred")
|
||
|
||
|
||
rule targets:
|
||
input:
|
||
"result/multiqc_report.html",
|
||
expand("result/fastqc/{sample}_fastqc.zip", sample=config["samples"]),
|
||
expand("result/bowtie/{sample}.bam.bai", sample=config["samples"]),
|
||
"result/featureCounts/counts_matrix.txt",
|
||
|
||
|
||
rule fastqc:
|
||
input:
|
||
"data/{sample}.fastq.gz",
|
||
output:
|
||
report(
|
||
multiext("result/fastqc/{sample}", "_fastqc.zip", "_fastqc.html"),
|
||
caption="report/fastqc.rst",
|
||
category="Step 1 fastqc",
|
||
),
|
||
log:
|
||
stdout="result/logs/{sample}_fastqc.stdout",
|
||
stderr="result/logs/{sample}_fastqc.stderr",
|
||
benchmark:
|
||
"result/benchmark/{sample}_fastqc.tsv"
|
||
params:
|
||
outdir="result/fastqc",
|
||
message:
|
||
"Executing Fastqc with {threads} threads on the following sample : {wildcards.sample}"
|
||
threads: 1
|
||
envmodules:
|
||
"fastqc/fastqc_v0.11.5",
|
||
resources:
|
||
mem="1gb",
|
||
time_min="00:01:00",
|
||
shell:
|
||
"""
|
||
fastqc {input} -outdir {params.outdir} 1> {log.stdout} 2> {log.stderr}
|
||
"""
|
||
|
||
|
||
rule bowtieIndex:
|
||
input:
|
||
fasta=lookup(dpath="references/fasta", within=config), # config["references"]["fasta"]
|
||
output:
|
||
idxFile=ensure(
|
||
multiext(
|
||
config["bowtie"]["idx"],
|
||
".1.bt2",
|
||
".2.bt2",
|
||
".3.bt2",
|
||
".4.bt2",
|
||
".rev.1.bt2",
|
||
".rev.2.bt2",
|
||
),
|
||
non_empty=True,
|
||
),
|
||
log:
|
||
"result/logs/fastqc/bowtieIndex.txt",
|
||
benchmark:
|
||
"result/benchmark/bowtieIndex.tsv"
|
||
params:
|
||
idx=config["bowtie"]["idx"],
|
||
message:
|
||
"Indexing genome"
|
||
threads: max(1, config["bowtie"]["idxthreads"])
|
||
envmodules:
|
||
"nodes/bowtie2-2.4.2",
|
||
resources:
|
||
mem="1gb",
|
||
time_min="00:05:00",
|
||
shell:
|
||
"""
|
||
bowtie2-build --threads {threads} {input.fasta} {params.idx} &> {log}
|
||
"""
|
||
|
||
|
||
rule bowtie2:
|
||
input:
|
||
fq="data/{sample}.fastq.gz",
|
||
idxFile=rules.bowtieIndex.output,
|
||
output:
|
||
bam=report(
|
||
pipe("result/bowtie/{sample}.sam"),
|
||
caption="report/bowtie2.rst",
|
||
category="Step 2 Bowtie2",
|
||
),
|
||
# not a good idea because bam files are heavy
|
||
log:
|
||
bwtout="result/logs/bowtie/{sample}_align.txt",
|
||
stderr="result/logs/bowtie/{sample}_align.stderr",
|
||
benchmark:
|
||
"result/benchmark/{sample}_bowtie2.tsv"
|
||
params:
|
||
extra=config["bowtie"]["extra"],
|
||
idx=config["bowtie"]["idx"],
|
||
tmpDir=config.get("tempDir", "tmp"),
|
||
message:
|
||
"Executing Bowtie2 with {threads} threads on the following sample: {wildcards.sample}"
|
||
threads: max(1, config["bowtie"]["alignthreads"])
|
||
envmodules:
|
||
"nodes/bowtie2-2.4.2",
|
||
resources:
|
||
mem="2gb",
|
||
time_min="00:05:00",
|
||
shell:
|
||
"""
|
||
bowtie2 -x {params.idx} -U {input.fq} -p {threads} {params.extra} \
|
||
2> {log.bwtout} > {output}
|
||
"""
|
||
|
||
|
||
rule samtoolsSort:
|
||
input:
|
||
"result/bowtie/{sample}.sam",
|
||
output:
|
||
protected("result/bowtie/{sample}.bam"),
|
||
log:
|
||
"result/logs/samtools/{sample}.txt",
|
||
benchmark:
|
||
"result/benchmark/{sample}_samtoolsSort.tsv"
|
||
envmodules:
|
||
"nodes/samtools-1.11",
|
||
threads: 1
|
||
resources:
|
||
mem="2gb",
|
||
time_min="00:05:00",
|
||
params:
|
||
tmpDir=config.get("tempDir", "tmp"),
|
||
shell:
|
||
"""
|
||
samtools sort -O 'bam' -T {params.tmpDir} {input} > {output}
|
||
"""
|
||
|
||
|
||
rule samtoolsIndx:
|
||
input:
|
||
bam="result/bowtie/{sample}.bam",
|
||
output:
|
||
bai="result/bowtie/{sample}.bam.bai",
|
||
log:
|
||
"result/logs/bowtie/{sample}_index.txt",
|
||
threads: 1
|
||
resources:
|
||
mem="1gb",
|
||
time_min="00:05:00",
|
||
envmodules:
|
||
"nodes/samtools-1.11",
|
||
shell:
|
||
"""
|
||
samtools index -b {input.bam} &> {log}
|
||
"""
|
||
|
||
|
||
rule featureCounts:
|
||
input:
|
||
bam="result/bowtie/{sample}.bam",
|
||
annot=config["references"]["annot"],
|
||
output:
|
||
report(
|
||
ensure("result/featureCounts/{sample}_ftc.txt", non_empty=True),
|
||
caption="report/featureCounts.rst",
|
||
category="Step 3 featureCounts",
|
||
),
|
||
log:
|
||
"result/logs/featureCounts/{sample}_ftc.txt",
|
||
threads: 1
|
||
resources:
|
||
mem="1gb",
|
||
time_min="00:10:00",
|
||
envmodules:
|
||
"subread/subread-1.5.2",
|
||
params:
|
||
extra=config["featureCounts"]["extra"],
|
||
shell:
|
||
"""
|
||
featureCounts {params.extra} -a {input.annot} -o {output} \
|
||
{input.bam} &> {log}
|
||
"""
|
||
|
||
|
||
rule extractCounts:
|
||
input:
|
||
"result/featureCounts/{sample}_ftc.txt",
|
||
output:
|
||
temp("result/featureCounts/{sample}_ftc{colExtra}.txt"),
|
||
log:
|
||
"result/logs/featureCounts/{sample}_extractCounts{colExtra}.txt",
|
||
threads: 1
|
||
resources:
|
||
mem="1gb",
|
||
time_min="00:05:00",
|
||
shell:
|
||
"""
|
||
cut -f {wildcards.colExtra} {input} | sed 1d > {output} 2> {log}
|
||
"""
|
||
|
||
|
||
rule matrixCounts:
|
||
input:
|
||
countfile=expand(
|
||
"result/featureCounts/{sample}_ftc{colExtra}.txt",
|
||
sample=config["samples"],
|
||
colExtra="7",
|
||
),
|
||
geneID=expand(
|
||
"result/featureCounts/{sample}_ftc{colExtra}.txt",
|
||
sample=config["samples"],
|
||
colExtra="1",
|
||
),
|
||
output:
|
||
"result/featureCounts/counts_matrix.txt",
|
||
log:
|
||
"result/logs/featureCounts/counts_matrix.txt",
|
||
threads: 1
|
||
resources:
|
||
mem="1gb",
|
||
time_min="00:05:00",
|
||
shell:
|
||
"""
|
||
paste {input.geneID[0]} {input.countfile} > {output} 2> {log}
|
||
"""
|
||
|
||
|
||
rule multiqc:
|
||
input:
|
||
expand(
|
||
"result/fastqc/{sample}_fastqc.zip",
|
||
sample=config["samples"],
|
||
allow_missing=True,
|
||
),
|
||
expand("result/fastqc/{sample}_fastqc.html", sample=config["samples"]),
|
||
expand("result/bowtie/{sample}.bam", sample=config["samples"]),
|
||
expand("result/featureCounts/{sample}_ftc.txt", sample=config["samples"]),
|
||
output:
|
||
"result/multiqc_report.html",
|
||
log:
|
||
report(
|
||
"result/logs/multiqc/multiqc.log",
|
||
caption="report/multiqc.rst",
|
||
category="Step 4 multiqc",
|
||
),
|
||
threads: 1
|
||
resources:
|
||
mem="2gb",
|
||
time_min="00:30:00",
|
||
params:
|
||
dir="result",
|
||
envmodules:
|
||
"nodes/multiqc-1.9",
|
||
shell:
|
||
"""
|
||
multiqc --outdir {params.dir} {params.dir} &> {log}
|
||
"""
|
demo_advanced/codes/profile/config.yaml | ||
---|---|---|
restart-times: 3
|
||
latency-wait: 180
|
||
software-deployment-method: env-modules
|
||
jobs: 16
|
||
keep-going: True
|
||
printshellcmds: True
|
demo_advanced/codes/report/alignPipeline.rst | ||
---|---|---|
**RNA-seq: Quality control and alignment**
|
||
|
||
# RNA-seq pipeline
|
||
|
||
## Step1: Fastqc
|
||
|
||
Check the sequence quality with Fastqc_
|
||
|
||
.. _Fastqc: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/
|
||
|
||
*Params:* default parameters
|
||
|
||
*Version:* fastqc/fastqc_v0.11.5
|
||
|
||
*Cite:* Andrews S. (2010). FastQC: a quality control tool for high throughput sequence data. Available online at: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/
|
||
|
||
|
||
## Step2: Bowtie2 and Samtools
|
||
|
||
Using Bowtie2_ for alignment on a reference genome. Samtools_ allowed to filter, index and convert format
|
||
|
||
.. _Bowtie2: https://bowtie-bio.sourceforge.net/bowtie2/index.shtml
|
||
.. _Samtools: http://www.htslib.org/
|
||
|
||
*Params:* {{ snakemake.config["bowtie"]["extra"] }}
|
||
|
||
*Version:*
|
||
|
||
- Bowtie2: nodes/bowtie2-2.4.2
|
||
- Samtools: nodes/samtools-1.11
|
||
|
||
*Cite:*
|
||
|
||
- Bowtie2: Langmead B, Salzberg S. Fast gapped-read alignment with Bowtie 2. Nature Methods. 2012, 9:357-359.
|
||
- Samtools: Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R, and 1000 Genome Project Data Processing Subgroup, The Sequence alignment/map (SAM) format and SAMtools, Bioinformatics (2009) 25(16) 2078-9
|
||
|
||
|
||
## Step3: featureCounts
|
||
|
||
Using FeatureCounts_ to quantify the level of gene expression
|
||
|
||
.. _FeatureCounts: https://subread.sourceforge.net/featureCounts.html
|
||
|
||
*Params:* {{ snakemake.config["featureCounts"]["extra"] }}
|
||
|
||
*Version:* subread/subread-1.5.2
|
||
|
||
*Cite:* Liao Y, Smyth GK and Shi W (2014). featureCounts: an efficient general purpose program for assigning sequence reads to genomic features. Bioinformatics, 30(7):923-30.
|
||
|
||
|
||
## Step4: Multiqc
|
||
|
||
With Multiqc_ combine all results in html (easy-to-use)
|
||
|
||
.. _Multiqc: https://multiqc.info/
|
||
|
||
*Params:* default parameters
|
||
|
||
*Version:* nodes/multiqc-1.9
|
||
|
||
|
||
*Cite:* MultiQC: Summarize analysis results for multiple tools and samples in a single report Philip Ewels, Måns Magnusson, Sverker Lundin and Max Käller. Bioinformatics (2016)
|
||
|
||
|
||
# Pipeline
|
||
|
||
- Snakemake_ : Mölder, F., Jablonski, K.P., Letcher, B., Hall, M.B., Tomkins-Tinch, C.H., Sochat, V., Forster, J., Lee, S., Twardziok, S.O., Kanitz, A., Wilm, A., Holtgrewe, M., Rahmann, S., Nahnsen, S., Köster, J., 2021. Sustainable data analysis with Snakemake. F1000Res 10, 33.
|
||
- Conda_
|
||
|
||
.. _Snakemake: https://snakemake.readthedocs.io/en/stable/
|
||
.. _Conda: https://docs.conda.io/en/latest/
|
demo_advanced/codes/report/bowtie2.rst | ||
---|---|---|
Bowtie2 and samtools
|
demo_advanced/codes/report/fastqc.rst | ||
---|---|---|
Running fastqc with default params
|
demo_advanced/codes/report/featureCounts.rst | ||
---|---|---|
FeatureCounts from Subread
|
demo_advanced/codes/report/multiqc.rst | ||
---|---|---|
Runiing with default params
|
demo_advanced/configfile.yaml | ||
---|---|---|
samples:
|
||
SRR3099585_chr18: SRR3099585_chr18.fastq.gz
|
||
SRR3099586_chr18: SRR3099586_chr18.fastq.gz
|
||
SRR3099587_chr18: SRR3099587_chr18.fastq.gz
|
||
SRR3105697_chr18: SRR3105697_chr18.fastq.gz
|
||
SRR3105698_chr18: SRR3105698_chr18.fastq.gz
|
||
SRR3105699_chr18: SRR3105699_chr18.fastq.gz
|
||
references:
|
||
annot: "references/O.tauri_annotation.gff"
|
||
fasta: "references/O.tauri_genome.fna"
|
||
bowtie:
|
||
idx: "references/bowtie2index/Otauri"
|
||
idxthreads: 1
|
||
extra: ""
|
||
alignthreads: 8
|
||
featureCounts:
|
||
extra: '-t "gene" -g "ID" -s "2"'
|
||
tempDir: "/tmp"
|
demo_advanced/readme_runSnake.txt | ||
---|---|---|
# Utilisation du Snakefile
|
||
|
||
Pour utiliser ce snakemake (tested with snakemake version 8.4.6) avec le cluster i2bc:
|
||
|
||
Le Snakefile a été reformaté avec le logiciel snakefmt (v0.10.0).
|
||
|
||
L'arborescence du code:
|
||
|
||
|
||
├── Snakefile
|
||
├── profile
|
||
│ └── config.yaml
|
||
└── report
|
||
├── alignPipeline.rst
|
||
├── bowtie2.rst
|
||
├── fastqc.rst
|
||
├── featureCounts.rst
|
||
└── multiqc.rst
|
||
|
||
Le snakefile définit toutes les règles, le profile contient des options utilisées souvent dans la ligne de commande et le dossier report contient les informations qui vont être inscrits dans le rapport.
|
||
|
||
Pour exécuter ce code, il faut aussi des données. Elles sont disponnibles sur Zenodo (https://zenodo.org/records/3997237). Ce sont des données single-end RNA-seq.
|
||
|
||
- Il faut mettre les données dans un repertoire : data
|
||
- Les annotations sont dans le dossier references mais ce nom peut changer grâce au fichier de configuration (configfile.yaml)
|
||
- Les résultats seront écrits dans le répertoire : result
|
||
- Dans le fichier runRnaseq.sh, il faut modifier le chemin vers le dossier projet.
|
||
|
||
Voila un exemple d'organisation du dossier de données mais aussi de résultat :
|
||
|
||
├── references
|
||
│ ├── O.tauri_annotation.gff
|
||
│ ├── O.tauri_genome.fna
|
||
│ └── bowtie2index
|
||
├── configfile.yaml
|
||
├── data
|
||
│ ├── SRR3099585_chr18.fastq.gz
|
||
│ ├── SRR3099586_chr18.fastq.gz
|
||
│ ├── SRR3099587_chr18.fastq.gz
|
||
│ ├── SRR3105697_chr18.fastq.gz
|
||
│ ├── SRR3105698_chr18.fastq.gz
|
||
│ └── SRR3105699_chr18.fastq.gz
|
||
├── result
|
||
│ ├── benchmark
|
||
│ ├── bowtie
|
||
│ ├── fastqc
|
||
│ ├── featureCounts
|
||
│ ├── logs
|
||
│ ├── multiqc_data
|
||
│ └── multiqc_report.html
|
||
└── runSnakemake.sh
|
||
|
||
|
||
La ligne de commande : qsub -V -l walltime=4:00:00 -l select=1:ncpus=1:mem=100mb runRnaseq.sh
|
||
|
||
Pour obtenir le rapport il faut relancer la même commande snakemake mais il faut ajouter l'option : --report myFinalReport.html
|
||
|
||
# Les fonctions utilisées
|
||
|
||
## Fonction général
|
||
|
||
- envmodules : pour spécifier les environnements utilisés pour chaque règles (https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html#using-environment-modules)
|
||
- wildcard_constraints : permet une verification de la valeur des variables (https://snakemake.readthedocs.io/en/stable/tutorial/additional_features.html#constraining-wildcards)
|
||
|
||
## rule fastqc
|
||
|
||
- Utilisation de la fonction report pour créer un rapport après exécution du script (https://snakemake.readthedocs.io/en/stable/snakefiles/reporting.html#reports)
|
||
- multiext : variant de la fonction expand (https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#the-multiext-function)
|
||
|
||
## rule bowtieIndex
|
||
|
||
- lookup : https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#the-lookup-function
|
||
- ensure : vérification si le fichier est vide (https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#ensuring-output-file-properties-like-non-emptyness-or-checksum-compliance)
|
||
- multiext : variant de la fonction expand (https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#the-multiext-function)
|
||
|
||
## rule bowtie2 et samtoolsSort
|
||
|
||
- pipe : permet de remplacer l'utilisation de pipe et de scinder les différents éléments de la ligne de commande en plusieurs règles (https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#piped-output)
|
||
- config.get : permet de mettre une valeur par défaut si elle manque dans le fichier de configuration
|
||
|
||
## rule extractCounts
|
||
|
||
- temp : suppression des fichiers une fois qu'ils ne sont plus utils (https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#protected-and-temporary-files)
|
||
|
||
## rule matrixCounts
|
||
|
||
- On peut utiliser autant de wildcards que l'on veut. Dans cette règle 2 wildcards sont utilisées.
|
demo_advanced/runRnaseq.sh | ||
---|---|---|
#!/bin/bash
|
||
|
||
module load snakemake/snakemake-8.4.6
|
||
|
||
cd /path/to/the/project/
|
||
|
||
snakemake --executor cluster-generic --cluster-generic-submit-cmd "qsub -V -l walltime={resources.time_min} -l select=1:ncpus={threads}:mem={resources.mem}" --snakefile codes/Snakefile --profile codes/profile --configfile configfile.yaml &> smkpipeline.txt
|
||
|
||
snakemake --executor cluster-generic --cluster-generic-submit-cmd "qsub -V -l walltime={resources.time_min} -l select=1:ncpus={threads}:mem={resources.mem}" --snakefile codes/Snakefile --profile codes/profile --configfile configfile.yaml --report smkRnaseq_report.html
|
exercise0/Snakefile | ||
---|---|---|
samples=["P10415", "P01308"]
|
||
|
||
rule targets:
|
||
input:
|
||
expand("fasta/{sample}.fasta", sample=samples),
|
||
"fusionFasta/allSequences.fasta",
|
||
"mafft/mafft_res.fasta",
|
||
|
||
|
||
rule loadData:
|
||
output:
|
||
"fasta/{sample}.fasta",
|
||
log:
|
||
stdout = "logs/{sample}_wget.stdout",
|
||
stderr = "logs/{sample}_wget.stderr",
|
||
params:
|
||
dirFasta = "fasta",
|
||
threads: 1
|
||
shell:
|
||
"""
|
||
wget --output-file {log.stderr} \
|
||
--directory-prefix {params.dirFasta} \
|
||
https://www.uniprot.org/uniprot/{wildcards.sample}.fasta > {log.stdout}
|
||
"""
|
||
|
||
|
||
rule fusionFasta:
|
||
input:
|
||
expand("fasta/{sample}.fasta", sample=samples),
|
||
output:
|
||
"fusionFasta/allSequences.fasta",
|
||
log:
|
||
"logs/fusionData.stderr",
|
||
threads: 1
|
||
shell:
|
||
"""
|
||
cat {input} > {output} 2> {log}
|
||
"""
|
||
|
||
|
||
rule mafft:
|
||
input:
|
||
rules.fusionFasta.output,
|
||
output:
|
||
"mafft/mafft_res.fasta",
|
||
log:
|
||
"logs/whichMafft.txt",
|
||
threads: 1
|
||
shell:
|
||
"""
|
||
mafft {input} > {output} 2> {log}
|
||
"""
|
exercise0/readme_runSnake.txt | ||
---|---|---|
Pour faire fonctionner le pipeline il faut se connecter sur un noeud du cluster puis:
|
||
|
||
- charger l'environnement snakemake:
|
||
module load snakemake/snakemake-8.4.6
|
||
module load nodes/mafft-7.475
|
||
|
||
- executer le programme:
|
||
snakemake --cores 4
|
readme_runSnake.txt | ||
---|---|---|
Pour faire fonctionner le pipeline il faut se connecter sur un noeud du cluster puis:
|
||
|
||
- charger l'environnement snakemake:
|
||
module load snakemake/snakemake-8.4.6
|
||
module load nodes/mafft-7.475
|
||
|
||
- executer le programme:
|
||
snakemake --cores 4
|
Also available in: Unified diff
feat: add advanced exercise/demo + move other files to exercise0/