Project

General

Profile

« Previous | Next » 

Revision 410d9e3d

Added by Chloé QUIGNOT 9 months ago

feat: add advanced exercise/demo + move other files to exercise0/

View differences:

Snakefile
samples=["P10415", "P01308"]
rule targets:
input:
expand("fasta/{sample}.fasta", sample=samples),
"fusionFasta/allSequences.fasta",
"mafft/mafft_res.fasta",
rule loadData:
output:
"fasta/{sample}.fasta",
log:
stdout = "logs/{sample}_wget.stdout",
stderr = "logs/{sample}_wget.stderr",
params:
dirFasta = "fasta",
threads: 1
shell:
"""
wget --output-file {log.stderr} \
--directory-prefix {params.dirFasta} \
https://www.uniprot.org/uniprot/{wildcards.sample}.fasta > {log.stdout}
"""
rule fusionFasta:
input:
expand("fasta/{sample}.fasta", sample=samples),
output:
"fusionFasta/allSequences.fasta",
log:
"logs/fusionData.stderr",
threads: 1
shell:
"""
cat {input} > {output} 2> {log}
"""
rule mafft:
input:
rules.fusionFasta.output,
output:
"mafft/mafft_res.fasta",
log:
"logs/whichMafft.txt",
threads: 1
shell:
"""
mafft {input} > {output} 2> {log}
"""
demo_advanced/codes/Snakefile
wildcard_constraints:
sample="[a-zA-Z0-9_]+", # matches Unicode word characters
colExtra="[0-9]",
# Report name:
report: "report/alignPipeline.rst"
onsuccess:
print("Workflow finished, no error")
onerror:
print("An error occurred")
rule targets:
input:
"result/multiqc_report.html",
expand("result/fastqc/{sample}_fastqc.zip", sample=config["samples"]),
expand("result/bowtie/{sample}.bam.bai", sample=config["samples"]),
"result/featureCounts/counts_matrix.txt",
rule fastqc:
input:
"data/{sample}.fastq.gz",
output:
report(
multiext("result/fastqc/{sample}", "_fastqc.zip", "_fastqc.html"),
caption="report/fastqc.rst",
category="Step 1 fastqc",
),
log:
stdout="result/logs/{sample}_fastqc.stdout",
stderr="result/logs/{sample}_fastqc.stderr",
benchmark:
"result/benchmark/{sample}_fastqc.tsv"
params:
outdir="result/fastqc",
message:
"Executing Fastqc with {threads} threads on the following sample : {wildcards.sample}"
threads: 1
envmodules:
"fastqc/fastqc_v0.11.5",
resources:
mem="1gb",
time_min="00:01:00",
shell:
"""
fastqc {input} -outdir {params.outdir} 1> {log.stdout} 2> {log.stderr}
"""
rule bowtieIndex:
input:
fasta=lookup(dpath="references/fasta", within=config), # config["references"]["fasta"]
output:
idxFile=ensure(
multiext(
config["bowtie"]["idx"],
".1.bt2",
".2.bt2",
".3.bt2",
".4.bt2",
".rev.1.bt2",
".rev.2.bt2",
),
non_empty=True,
),
log:
"result/logs/fastqc/bowtieIndex.txt",
benchmark:
"result/benchmark/bowtieIndex.tsv"
params:
idx=config["bowtie"]["idx"],
message:
"Indexing genome"
threads: max(1, config["bowtie"]["idxthreads"])
envmodules:
"nodes/bowtie2-2.4.2",
resources:
mem="1gb",
time_min="00:05:00",
shell:
"""
bowtie2-build --threads {threads} {input.fasta} {params.idx} &> {log}
"""
rule bowtie2:
input:
fq="data/{sample}.fastq.gz",
idxFile=rules.bowtieIndex.output,
output:
bam=report(
pipe("result/bowtie/{sample}.sam"),
caption="report/bowtie2.rst",
category="Step 2 Bowtie2",
),
# not a good idea because bam files are heavy
log:
bwtout="result/logs/bowtie/{sample}_align.txt",
stderr="result/logs/bowtie/{sample}_align.stderr",
benchmark:
"result/benchmark/{sample}_bowtie2.tsv"
params:
extra=config["bowtie"]["extra"],
idx=config["bowtie"]["idx"],
tmpDir=config.get("tempDir", "tmp"),
message:
"Executing Bowtie2 with {threads} threads on the following sample: {wildcards.sample}"
threads: max(1, config["bowtie"]["alignthreads"])
envmodules:
"nodes/bowtie2-2.4.2",
resources:
mem="2gb",
time_min="00:05:00",
shell:
"""
bowtie2 -x {params.idx} -U {input.fq} -p {threads} {params.extra} \
2> {log.bwtout} > {output}
"""
rule samtoolsSort:
input:
"result/bowtie/{sample}.sam",
output:
protected("result/bowtie/{sample}.bam"),
log:
"result/logs/samtools/{sample}.txt",
benchmark:
"result/benchmark/{sample}_samtoolsSort.tsv"
envmodules:
"nodes/samtools-1.11",
threads: 1
resources:
mem="2gb",
time_min="00:05:00",
params:
tmpDir=config.get("tempDir", "tmp"),
shell:
"""
samtools sort -O 'bam' -T {params.tmpDir} {input} > {output}
"""
rule samtoolsIndx:
input:
bam="result/bowtie/{sample}.bam",
output:
bai="result/bowtie/{sample}.bam.bai",
log:
"result/logs/bowtie/{sample}_index.txt",
threads: 1
resources:
mem="1gb",
time_min="00:05:00",
envmodules:
"nodes/samtools-1.11",
shell:
"""
samtools index -b {input.bam} &> {log}
"""
rule featureCounts:
input:
bam="result/bowtie/{sample}.bam",
annot=config["references"]["annot"],
output:
report(
ensure("result/featureCounts/{sample}_ftc.txt", non_empty=True),
caption="report/featureCounts.rst",
category="Step 3 featureCounts",
),
log:
"result/logs/featureCounts/{sample}_ftc.txt",
threads: 1
resources:
mem="1gb",
time_min="00:10:00",
envmodules:
"subread/subread-1.5.2",
params:
extra=config["featureCounts"]["extra"],
shell:
"""
featureCounts {params.extra} -a {input.annot} -o {output} \
{input.bam} &> {log}
"""
rule extractCounts:
input:
"result/featureCounts/{sample}_ftc.txt",
output:
temp("result/featureCounts/{sample}_ftc{colExtra}.txt"),
log:
"result/logs/featureCounts/{sample}_extractCounts{colExtra}.txt",
threads: 1
resources:
mem="1gb",
time_min="00:05:00",
shell:
"""
cut -f {wildcards.colExtra} {input} | sed 1d > {output} 2> {log}
"""
rule matrixCounts:
input:
countfile=expand(
"result/featureCounts/{sample}_ftc{colExtra}.txt",
sample=config["samples"],
colExtra="7",
),
geneID=expand(
"result/featureCounts/{sample}_ftc{colExtra}.txt",
sample=config["samples"],
colExtra="1",
),
output:
"result/featureCounts/counts_matrix.txt",
log:
"result/logs/featureCounts/counts_matrix.txt",
threads: 1
resources:
mem="1gb",
time_min="00:05:00",
shell:
"""
paste {input.geneID[0]} {input.countfile} > {output} 2> {log}
"""
rule multiqc:
input:
expand(
"result/fastqc/{sample}_fastqc.zip",
sample=config["samples"],
allow_missing=True,
),
expand("result/fastqc/{sample}_fastqc.html", sample=config["samples"]),
expand("result/bowtie/{sample}.bam", sample=config["samples"]),
expand("result/featureCounts/{sample}_ftc.txt", sample=config["samples"]),
output:
"result/multiqc_report.html",
log:
report(
"result/logs/multiqc/multiqc.log",
caption="report/multiqc.rst",
category="Step 4 multiqc",
),
threads: 1
resources:
mem="2gb",
time_min="00:30:00",
params:
dir="result",
envmodules:
"nodes/multiqc-1.9",
shell:
"""
multiqc --outdir {params.dir} {params.dir} &> {log}
"""
demo_advanced/codes/profile/config.yaml
restart-times: 3
latency-wait: 180
software-deployment-method: env-modules
jobs: 16
keep-going: True
printshellcmds: True
demo_advanced/codes/report/alignPipeline.rst
**RNA-seq: Quality control and alignment**
# RNA-seq pipeline
## Step1: Fastqc
Check the sequence quality with Fastqc_
.. _Fastqc: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/
*Params:* default parameters
*Version:* fastqc/fastqc_v0.11.5
*Cite:* Andrews S. (2010). FastQC: a quality control tool for high throughput sequence data. Available online at: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/
## Step2: Bowtie2 and Samtools
Using Bowtie2_ for alignment on a reference genome. Samtools_ allowed to filter, index and convert format
.. _Bowtie2: https://bowtie-bio.sourceforge.net/bowtie2/index.shtml
.. _Samtools: http://www.htslib.org/
*Params:* {{ snakemake.config["bowtie"]["extra"] }}
*Version:*
- Bowtie2: nodes/bowtie2-2.4.2
- Samtools: nodes/samtools-1.11
*Cite:*
- Bowtie2: Langmead B, Salzberg S. Fast gapped-read alignment with Bowtie 2. Nature Methods. 2012, 9:357-359.
- Samtools: Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R, and 1000 Genome Project Data Processing Subgroup, The Sequence alignment/map (SAM) format and SAMtools, Bioinformatics (2009) 25(16) 2078-9
## Step3: featureCounts
Using FeatureCounts_ to quantify the level of gene expression
.. _FeatureCounts: https://subread.sourceforge.net/featureCounts.html
*Params:* {{ snakemake.config["featureCounts"]["extra"] }}
*Version:* subread/subread-1.5.2
*Cite:* Liao Y, Smyth GK and Shi W (2014). featureCounts: an efficient general purpose program for assigning sequence reads to genomic features. Bioinformatics, 30(7):923-30.
## Step4: Multiqc
With Multiqc_ combine all results in html (easy-to-use)
.. _Multiqc: https://multiqc.info/
*Params:* default parameters
*Version:* nodes/multiqc-1.9
*Cite:* MultiQC: Summarize analysis results for multiple tools and samples in a single report Philip Ewels, Måns Magnusson, Sverker Lundin and Max Käller. Bioinformatics (2016)
# Pipeline
- Snakemake_ : Mölder, F., Jablonski, K.P., Letcher, B., Hall, M.B., Tomkins-Tinch, C.H., Sochat, V., Forster, J., Lee, S., Twardziok, S.O., Kanitz, A., Wilm, A., Holtgrewe, M., Rahmann, S., Nahnsen, S., Köster, J., 2021. Sustainable data analysis with Snakemake. F1000Res 10, 33.
- Conda_
.. _Snakemake: https://snakemake.readthedocs.io/en/stable/
.. _Conda: https://docs.conda.io/en/latest/
demo_advanced/codes/report/bowtie2.rst
Bowtie2 and samtools
demo_advanced/codes/report/fastqc.rst
Running fastqc with default params
demo_advanced/codes/report/featureCounts.rst
FeatureCounts from Subread
demo_advanced/codes/report/multiqc.rst
Runiing with default params
demo_advanced/configfile.yaml
samples:
SRR3099585_chr18: SRR3099585_chr18.fastq.gz
SRR3099586_chr18: SRR3099586_chr18.fastq.gz
SRR3099587_chr18: SRR3099587_chr18.fastq.gz
SRR3105697_chr18: SRR3105697_chr18.fastq.gz
SRR3105698_chr18: SRR3105698_chr18.fastq.gz
SRR3105699_chr18: SRR3105699_chr18.fastq.gz
references:
annot: "references/O.tauri_annotation.gff"
fasta: "references/O.tauri_genome.fna"
bowtie:
idx: "references/bowtie2index/Otauri"
idxthreads: 1
extra: ""
alignthreads: 8
featureCounts:
extra: '-t "gene" -g "ID" -s "2"'
tempDir: "/tmp"
demo_advanced/readme_runSnake.txt
# Utilisation du Snakefile
Pour utiliser ce snakemake (tested with snakemake version 8.4.6) avec le cluster i2bc:
Le Snakefile a été reformaté avec le logiciel snakefmt (v0.10.0).
L'arborescence du code:
├── Snakefile
├── profile
│   └── config.yaml
└── report
├── alignPipeline.rst
├── bowtie2.rst
├── fastqc.rst
├── featureCounts.rst
└── multiqc.rst
Le snakefile définit toutes les règles, le profile contient des options utilisées souvent dans la ligne de commande et le dossier report contient les informations qui vont être inscrits dans le rapport.
Pour exécuter ce code, il faut aussi des données. Elles sont disponnibles sur Zenodo (https://zenodo.org/records/3997237). Ce sont des données single-end RNA-seq.
- Il faut mettre les données dans un repertoire : data
- Les annotations sont dans le dossier references mais ce nom peut changer grâce au fichier de configuration (configfile.yaml)
- Les résultats seront écrits dans le répertoire : result
- Dans le fichier runRnaseq.sh, il faut modifier le chemin vers le dossier projet.
Voila un exemple d'organisation du dossier de données mais aussi de résultat :
├── references
│   ├── O.tauri_annotation.gff
│   ├── O.tauri_genome.fna
│   └── bowtie2index
├── configfile.yaml
├── data
│   ├── SRR3099585_chr18.fastq.gz
│   ├── SRR3099586_chr18.fastq.gz
│   ├── SRR3099587_chr18.fastq.gz
│   ├── SRR3105697_chr18.fastq.gz
│   ├── SRR3105698_chr18.fastq.gz
│   └── SRR3105699_chr18.fastq.gz
├── result
│   ├── benchmark
│   ├── bowtie
│   ├── fastqc
│   ├── featureCounts
│   ├── logs
│   ├── multiqc_data
│   └── multiqc_report.html
└── runSnakemake.sh
La ligne de commande : qsub -V -l walltime=4:00:00 -l select=1:ncpus=1:mem=100mb runRnaseq.sh
Pour obtenir le rapport il faut relancer la même commande snakemake mais il faut ajouter l'option : --report myFinalReport.html
# Les fonctions utilisées
## Fonction général
- envmodules : pour spécifier les environnements utilisés pour chaque règles (https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html#using-environment-modules)
- wildcard_constraints : permet une verification de la valeur des variables (https://snakemake.readthedocs.io/en/stable/tutorial/additional_features.html#constraining-wildcards)
## rule fastqc
- Utilisation de la fonction report pour créer un rapport après exécution du script (https://snakemake.readthedocs.io/en/stable/snakefiles/reporting.html#reports)
- multiext : variant de la fonction expand (https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#the-multiext-function)
## rule bowtieIndex
- lookup : https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#the-lookup-function
- ensure : vérification si le fichier est vide (https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#ensuring-output-file-properties-like-non-emptyness-or-checksum-compliance)
- multiext : variant de la fonction expand (https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#the-multiext-function)
## rule bowtie2 et samtoolsSort
- pipe : permet de remplacer l'utilisation de pipe et de scinder les différents éléments de la ligne de commande en plusieurs règles (https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#piped-output)
- config.get : permet de mettre une valeur par défaut si elle manque dans le fichier de configuration
## rule extractCounts
- temp : suppression des fichiers une fois qu'ils ne sont plus utils (https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#protected-and-temporary-files)
## rule matrixCounts
- On peut utiliser autant de wildcards que l'on veut. Dans cette règle 2 wildcards sont utilisées.
demo_advanced/runRnaseq.sh
#!/bin/bash
module load snakemake/snakemake-8.4.6
cd /path/to/the/project/
snakemake --executor cluster-generic --cluster-generic-submit-cmd "qsub -V -l walltime={resources.time_min} -l select=1:ncpus={threads}:mem={resources.mem}" --snakefile codes/Snakefile --profile codes/profile --configfile configfile.yaml &> smkpipeline.txt
snakemake --executor cluster-generic --cluster-generic-submit-cmd "qsub -V -l walltime={resources.time_min} -l select=1:ncpus={threads}:mem={resources.mem}" --snakefile codes/Snakefile --profile codes/profile --configfile configfile.yaml --report smkRnaseq_report.html
exercise0/Snakefile
samples=["P10415", "P01308"]
rule targets:
input:
expand("fasta/{sample}.fasta", sample=samples),
"fusionFasta/allSequences.fasta",
"mafft/mafft_res.fasta",
rule loadData:
output:
"fasta/{sample}.fasta",
log:
stdout = "logs/{sample}_wget.stdout",
stderr = "logs/{sample}_wget.stderr",
params:
dirFasta = "fasta",
threads: 1
shell:
"""
wget --output-file {log.stderr} \
--directory-prefix {params.dirFasta} \
https://www.uniprot.org/uniprot/{wildcards.sample}.fasta > {log.stdout}
"""
rule fusionFasta:
input:
expand("fasta/{sample}.fasta", sample=samples),
output:
"fusionFasta/allSequences.fasta",
log:
"logs/fusionData.stderr",
threads: 1
shell:
"""
cat {input} > {output} 2> {log}
"""
rule mafft:
input:
rules.fusionFasta.output,
output:
"mafft/mafft_res.fasta",
log:
"logs/whichMafft.txt",
threads: 1
shell:
"""
mafft {input} > {output} 2> {log}
"""
exercise0/readme_runSnake.txt
Pour faire fonctionner le pipeline il faut se connecter sur un noeud du cluster puis:
- charger l'environnement snakemake:
module load snakemake/snakemake-8.4.6
module load nodes/mafft-7.475
- executer le programme:
snakemake --cores 4
readme_runSnake.txt
Pour faire fonctionner le pipeline il faut se connecter sur un noeud du cluster puis:
- charger l'environnement snakemake:
module load snakemake/snakemake-8.4.6
module load nodes/mafft-7.475
- executer le programme:
snakemake --cores 4

Also available in: Unified diff