«
Previous
|
Next
»
Revision d1a8897a
Added by Chloé QUIGNOT 7 months ago
- ID d1a8897a04d290ec1625d733e784b5d3aa54f235
- Parent 387c5cc5
demo_advanced/codes/Snakefile | ||
---|---|---|
wildcard_constraints:
|
||
sample="[a-zA-Z0-9_]+", # matches Unicode word characters
|
||
colExtra="[0-9]",
|
||
|
||
|
||
# Report name:
|
||
report: "report/alignPipeline.rst"
|
||
|
||
|
||
onsuccess:
|
||
print("Workflow finished, no error")
|
||
|
||
|
||
onerror:
|
||
print("An error occurred")
|
||
|
||
|
||
rule targets:
|
||
input:
|
||
"result/multiqc_report.html",
|
||
expand("result/fastqc/{sample}_fastqc.zip", sample=config["samples"]),
|
||
expand("result/bowtie/{sample}.bam.bai", sample=config["samples"]),
|
||
"result/featureCounts/counts_matrix.txt",
|
||
|
||
|
||
rule fastqc:
|
||
input:
|
||
lambda wc: config["samples"][wc.sample]
|
||
output:
|
||
report(
|
||
multiext("result/fastqc/{sample}", "_fastqc.zip", "_fastqc.html"),
|
||
caption="report/fastqc.rst",
|
||
category="Step 1 fastqc",
|
||
),
|
||
log:
|
||
stdout="result/logs/{sample}_fastqc.stdout",
|
||
stderr="result/logs/{sample}_fastqc.stderr",
|
||
benchmark:
|
||
"result/benchmark/{sample}_fastqc.tsv"
|
||
params:
|
||
outdir="result/fastqc",
|
||
message:
|
||
"Executing Fastqc with {threads} threads on the following sample : {wildcards.sample}"
|
||
threads: 1
|
||
envmodules:
|
||
"fastqc/fastqc_v0.11.5",
|
||
resources:
|
||
mem="1gb",
|
||
time_min="00:01:00",
|
||
shell:
|
||
"""
|
||
fastqc {input} -outdir {params.outdir} 1> {log.stdout} 2> {log.stderr}
|
||
"""
|
||
|
||
|
||
rule bowtieIndex:
|
||
input:
|
||
fasta=lookup(dpath="references/fasta", within=config), # config["references"]["fasta"]
|
||
output:
|
||
idxFile=ensure(
|
||
multiext(
|
||
config["bowtie"]["idx"],
|
||
".1.bt2",
|
||
".2.bt2",
|
||
".3.bt2",
|
||
".4.bt2",
|
||
".rev.1.bt2",
|
||
".rev.2.bt2",
|
||
),
|
||
non_empty=True,
|
||
),
|
||
log:
|
||
"result/logs/fastqc/bowtieIndex.txt",
|
||
benchmark:
|
||
"result/benchmark/bowtieIndex.tsv"
|
||
params:
|
||
idx=config["bowtie"]["idx"],
|
||
message:
|
||
"Indexing genome"
|
||
threads: max(1, config["bowtie"]["idxthreads"])
|
||
envmodules:
|
||
"nodes/bowtie2-2.4.2",
|
||
resources:
|
||
mem="1gb",
|
||
time_min="00:05:00",
|
||
shell:
|
||
"""
|
||
bowtie2-build --threads {threads} {input.fasta} {params.idx} &> {log}
|
||
"""
|
||
|
||
|
||
rule bowtie2:
|
||
input:
|
||
fq=lambda wc: config["samples"][wc.sample],
|
||
idxFile=rules.bowtieIndex.output,
|
||
output:
|
||
bam=pipe("result/bowtie/{sample}.sam"),
|
||
log:
|
||
bwtout="result/logs/bowtie/{sample}_align.txt",
|
||
stderr="result/logs/bowtie/{sample}_align.stderr",
|
||
benchmark:
|
||
"result/benchmark/{sample}_bowtie2.tsv"
|
||
params:
|
||
extra=config["bowtie"]["extra"],
|
||
idx=config["bowtie"]["idx"],
|
||
tmpDir=config.get("tempDir", "tmp"),
|
||
message:
|
||
"Executing Bowtie2 with {threads} threads on the following sample: {wildcards.sample}"
|
||
threads: max(1, config["bowtie"]["alignthreads"])
|
||
envmodules:
|
||
"nodes/bowtie2-2.4.2",
|
||
resources:
|
||
mem="2gb",
|
||
time_min="00:05:00",
|
||
shell:
|
||
"""
|
||
bowtie2 -x {params.idx} -U {input.fq} -p {threads} {params.extra} \
|
||
2> {log.bwtout} > {output}
|
||
"""
|
||
|
||
|
||
rule samtoolsSort:
|
||
input:
|
||
"result/bowtie/{sample}.sam",
|
||
output:
|
||
report(
|
||
protected("result/bowtie/{sample}.bam"),
|
||
caption="report/bowtie2.rst",
|
||
category="Step 2 Bowtie2",
|
||
),
|
||
# not a good idea because bam files are heavy
|
||
log:
|
||
"result/logs/samtools/{sample}.txt",
|
||
benchmark:
|
||
"result/benchmark/{sample}_samtoolsSort.tsv"
|
||
envmodules:
|
||
"nodes/samtools-1.11",
|
||
threads: 1
|
||
resources:
|
||
mem="2gb",
|
||
time_min="00:05:00",
|
||
params:
|
||
tmpDir=config.get("tempDir", "tmp"),
|
||
shell:
|
||
"""
|
||
samtools sort -O 'bam' -T {params.tmpDir} {input} > {output}
|
||
"""
|
||
|
||
|
||
rule samtoolsIndx:
|
||
input:
|
||
bam="result/bowtie/{sample}.bam",
|
||
output:
|
||
bai="result/bowtie/{sample}.bam.bai",
|
||
log:
|
||
"result/logs/bowtie/{sample}_index.txt",
|
||
threads: 1
|
||
resources:
|
||
mem="1gb",
|
||
time_min="00:05:00",
|
||
envmodules:
|
||
"nodes/samtools-1.11",
|
||
shell:
|
||
"""
|
||
samtools index -b {input.bam} &> {log}
|
||
"""
|
||
|
||
|
||
rule featureCounts:
|
||
input:
|
||
bam="result/bowtie/{sample}.bam",
|
||
annot=config["references"]["annot"],
|
||
output:
|
||
report(
|
||
ensure("result/featureCounts/{sample}_ftc.txt", non_empty=True),
|
||
caption="report/featureCounts.rst",
|
||
category="Step 3 featureCounts",
|
||
),
|
||
log:
|
||
"result/logs/featureCounts/{sample}_ftc.txt",
|
||
threads: 1
|
||
resources:
|
||
mem="1gb",
|
||
time_min="00:10:00",
|
||
envmodules:
|
||
"subread/subread-1.5.2",
|
||
params:
|
||
extra=config["featureCounts"]["extra"],
|
||
shell:
|
||
"""
|
||
featureCounts {params.extra} -a {input.annot} -o {output} \
|
||
{input.bam} &> {log}
|
||
"""
|
||
|
||
|
||
rule extractCounts:
|
||
input:
|
||
"result/featureCounts/{sample}_ftc.txt",
|
||
output:
|
||
temp("result/featureCounts/{sample}_ftc{colExtra}.txt"),
|
||
log:
|
||
"result/logs/featureCounts/{sample}_extractCounts{colExtra}.txt",
|
||
threads: 1
|
||
resources:
|
||
mem="1gb",
|
||
time_min="00:05:00",
|
||
shell:
|
||
"""
|
||
cut -f {wildcards.colExtra} {input} | sed 1d > {output} 2> {log}
|
||
"""
|
||
|
||
|
||
rule matrixCounts:
|
||
input:
|
||
countfile=expand(
|
||
"result/featureCounts/{sample}_ftc{colExtra}.txt",
|
||
sample=config["samples"],
|
||
colExtra="7",
|
||
),
|
||
geneID=expand(
|
||
"result/featureCounts/{sample}_ftc{colExtra}.txt",
|
||
sample=config["samples"],
|
||
colExtra="1",
|
||
),
|
||
output:
|
||
"result/featureCounts/counts_matrix.txt",
|
||
log:
|
||
"result/logs/featureCounts/counts_matrix.txt",
|
||
threads: 1
|
||
resources:
|
||
mem="1gb",
|
||
time_min="00:05:00",
|
||
shell:
|
||
"""
|
||
paste {input.geneID[0]} {input.countfile} > {output} 2> {log}
|
||
"""
|
||
|
||
|
||
rule multiqc:
|
||
input:
|
||
expand(
|
||
"result/fastqc/{sample}_fastqc.zip",
|
||
sample=config["samples"],
|
||
allow_missing=True,
|
||
),
|
||
expand("result/fastqc/{sample}_fastqc.html", sample=config["samples"]),
|
||
expand("result/bowtie/{sample}.bam", sample=config["samples"]),
|
||
expand("result/featureCounts/{sample}_ftc.txt", sample=config["samples"]),
|
||
output:
|
||
"result/multiqc_report.html",
|
||
log:
|
||
report(
|
||
"result/logs/multiqc/multiqc.log",
|
||
caption="report/multiqc.rst",
|
||
category="Step 4 multiqc",
|
||
),
|
||
threads: 1
|
||
resources:
|
||
mem="2gb",
|
||
time_min="00:30:00",
|
||
params:
|
||
dir="result",
|
||
envmodules:
|
||
"nodes/multiqc-1.9",
|
||
shell:
|
||
"""
|
||
multiqc --outdir {params.dir} {params.dir} &> {log}
|
||
"""
|
demo_advanced/codes/profile/config.yaml | ||
---|---|---|
restart-times: 3
|
||
latency-wait: 180
|
||
software-deployment-method: env-modules
|
||
jobs: 16
|
||
keep-going: True
|
||
printshellcmds: True
|
demo_advanced/codes/report/alignPipeline.rst | ||
---|---|---|
**RNA-seq: Quality control and alignment**
|
||
|
||
# RNA-seq pipeline
|
||
|
||
## Step1: Fastqc
|
||
|
||
Check the sequence quality with Fastqc_
|
||
|
||
.. _Fastqc: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/
|
||
|
||
*Params:* default parameters
|
||
|
||
*Version:* fastqc/fastqc_v0.11.5
|
||
|
||
*Cite:* Andrews S. (2010). FastQC: a quality control tool for high throughput sequence data. Available online at: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/
|
||
|
||
|
||
## Step2: Bowtie2 and Samtools
|
||
|
||
Using Bowtie2_ for alignment on a reference genome. Samtools_ allowed to filter, index and convert format
|
||
|
||
.. _Bowtie2: https://bowtie-bio.sourceforge.net/bowtie2/index.shtml
|
||
.. _Samtools: http://www.htslib.org/
|
||
|
||
*Params:* {{ snakemake.config["bowtie"]["extra"] }}
|
||
|
||
*Version:*
|
||
|
||
- Bowtie2: nodes/bowtie2-2.4.2
|
||
- Samtools: nodes/samtools-1.11
|
||
|
||
*Cite:*
|
||
|
||
- Bowtie2: Langmead B, Salzberg S. Fast gapped-read alignment with Bowtie 2. Nature Methods. 2012, 9:357-359.
|
||
- Samtools: Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R, and 1000 Genome Project Data Processing Subgroup, The Sequence alignment/map (SAM) format and SAMtools, Bioinformatics (2009) 25(16) 2078-9
|
||
|
||
|
||
## Step3: featureCounts
|
||
|
||
Using FeatureCounts_ to quantify the level of gene expression
|
||
|
||
.. _FeatureCounts: https://subread.sourceforge.net/featureCounts.html
|
||
|
||
*Params:* {{ snakemake.config["featureCounts"]["extra"] }}
|
||
|
||
*Version:* subread/subread-1.5.2
|
||
|
||
*Cite:* Liao Y, Smyth GK and Shi W (2014). featureCounts: an efficient general purpose program for assigning sequence reads to genomic features. Bioinformatics, 30(7):923-30.
|
||
|
||
|
||
## Step4: Multiqc
|
||
|
||
With Multiqc_ combine all results in html (easy-to-use)
|
||
|
||
.. _Multiqc: https://multiqc.info/
|
||
|
||
*Params:* default parameters
|
||
|
||
*Version:* nodes/multiqc-1.9
|
||
|
||
|
||
*Cite:* MultiQC: Summarize analysis results for multiple tools and samples in a single report Philip Ewels, Måns Magnusson, Sverker Lundin and Max Käller. Bioinformatics (2016)
|
||
|
||
|
||
# Pipeline
|
||
|
||
- Snakemake_ : Mölder, F., Jablonski, K.P., Letcher, B., Hall, M.B., Tomkins-Tinch, C.H., Sochat, V., Forster, J., Lee, S., Twardziok, S.O., Kanitz, A., Wilm, A., Holtgrewe, M., Rahmann, S., Nahnsen, S., Köster, J., 2021. Sustainable data analysis with Snakemake. F1000Res 10, 33.
|
||
- Conda_
|
||
|
||
.. _Snakemake: https://snakemake.readthedocs.io/en/stable/
|
||
.. _Conda: https://docs.conda.io/en/latest/
|
demo_advanced/codes/report/bowtie2.rst | ||
---|---|---|
Bowtie2 and samtools
|
demo_advanced/codes/report/fastqc.rst | ||
---|---|---|
Running fastqc with default params
|
demo_advanced/codes/report/featureCounts.rst | ||
---|---|---|
FeatureCounts from Subread
|
demo_advanced/codes/report/multiqc.rst | ||
---|---|---|
Runiing with default params
|
demo_advanced/configfile.yaml | ||
---|---|---|
samples:
|
||
SRR3099585_chr18: Data/SRR3099585_chr18.fastq.gz
|
||
SRR3099586_chr18: Data/SRR3099586_chr18.fastq.gz
|
||
SRR3099587_chr18: Data/SRR3099587_chr18.fastq.gz
|
||
SRR3105697_chr18: Data/SRR3105697_chr18.fastq.gz
|
||
SRR3105698_chr18: Data/SRR3105698_chr18.fastq.gz
|
||
SRR3105699_chr18: Data/SRR3105699_chr18.fastq.gz
|
||
references:
|
||
annot: "Data/O.tauri_annotation.gff"
|
||
fasta: "Data/O.tauri_genome.fna"
|
||
bowtie:
|
||
idx: "bowtie2index/Otauri"
|
||
idxthreads: 1
|
||
extra: ""
|
||
alignthreads: 8
|
||
featureCounts:
|
||
extra: '-t "gene" -g "ID" -s "2"'
|
||
tempDir: "/tmp"
|
demo_advanced/readme_runSnake.txt | ||
---|---|---|
# Snakemake pipeline presentation
|
||
|
||
## The code
|
||
|
||
The pipeline's `code` folder contains the following arborescence of files:
|
||
|
||
```text
|
||
├── Snakefile
|
||
├── profile
|
||
│ └── config.yaml
|
||
└── report
|
||
├── alignPipeline.rst
|
||
├── bowtie2.rst
|
||
├── fastqc.rst
|
||
├── featureCounts.rst
|
||
└── multiqc.rst
|
||
```
|
||
|
||
The Snakefile is the file that contains all of Snakemake's rules. The configuration file `profile/config.yaml`
|
||
is a way of specifying options that you would otherwise add to the snakemake command line. The `report` folder
|
||
contains a set of hand-written report templates for each tool or method that was used and that will later
|
||
be incorporated into a final report file generated by Snakemake.
|
||
|
||
NB: this Snakefile was reformatted using the snakefmt tool (v0.10.0) in order to comply with the commonly
|
||
accepted (and encouraged) Snakemake formatting standards
|
||
|
||
## The required data (to download)
|
||
|
||
This pipeline runs on single-end bulk RNA-seq data and outputs quality reports on the samples provided as well as tables of
|
||
gene expression counts per sample.
|
||
|
||
Example input files can be found under (this link)[https://doi.org/10.5281/zenodo.8340293] (same as in Exercises 1A, 1B and 1C, see (this page)[https://bioi2.i2bc.paris-saclay.fr/training/snakemake/exercises]):
|
||
|
||
```text
|
||
Data/
|
||
├── O.tauri_annotation.gff
|
||
├── O.tauri_genome.fna
|
||
├── SRR3099585_chr18.fastq.gz
|
||
├── SRR3099586_chr18.fastq.gz
|
||
├── SRR3099587_chr18.fastq.gz
|
||
├── SRR3105697_chr18.fastq.gz
|
||
├── SRR3105698_chr18.fastq.gz
|
||
└── SRR3105699_chr18.fastq.gz
|
||
```
|
||
|
||
They contain 6 fastq files (chromosome 18 from SRR309958[5,6,7] and SRR310569[7,8,9]), as well as
|
||
the genome fasta of their reference species *O. tauri* and its annotation.
|
||
|
||
# Usage
|
||
|
||
**Requirements:**
|
||
- tools: snakemake, fastqc, bowtie, featureCounts, multiqc
|
||
- data: fastq files and their corresponding reference genome
|
||
|
||
**Before you run the pipeline:**
|
||
- edit the line `cd /path/to/the/project/` in the `runRnaseq.sh` file to your actual project folder e.g. `/path/to/snakemake_examples/exercise2_advanced/`
|
||
- edit the configuration file `configfile.yaml` so that the paths to the various samples and reference files are correct (a good habit is to use absolute paths) so that Snakemake can find them
|
||
|
||
For example, if my working directories look like this:
|
||
|
||
```text
|
||
/home/john.doe/snakemake_tutorial/Data/
|
||
├── O.tauri_annotation.gff
|
||
├── O.tauri_genome.fna
|
||
├── SRR3099585_chr18.fastq.gz
|
||
├── SRR3099586_chr18.fastq.gz
|
||
├── SRR3099587_chr18.fastq.gz
|
||
├── SRR3105697_chr18.fastq.gz
|
||
├── SRR3105698_chr18.fastq.gz
|
||
└── SRR3105699_chr18.fastq.gz
|
||
```
|
||
and
|
||
```text
|
||
/home/john.doe/snakemake_examples/exercise2_advanced/code/
|
||
├── Snakefile
|
||
├── profile
|
||
│ └── config.yaml
|
||
└── report
|
||
├── alignPipeline.rst
|
||
├── bowtie2.rst
|
||
├── fastqc.rst
|
||
├── featureCounts.rst
|
||
└── multiqc.rst
|
||
```
|
||
|
||
Then my configuration file should look like this (for example):
|
||
|
||
```text
|
||
samples:
|
||
SRR3099585_chr18: /home/john.doe/snakemake_tutorial/Data/SRR3099585_chr18.fastq.gz
|
||
SRR3099586_chr18: /home/john.doe/snakemake_tutorial/Data/SRR3099586_chr18.fastq.gz
|
||
SRR3099587_chr18: /home/john.doe/snakemake_tutorial/Data/SRR3099587_chr18.fastq.gz
|
||
SRR3105697_chr18: /home/john.doe/snakemake_tutorial/Data/SRR3105697_chr18.fastq.gz
|
||
SRR3105698_chr18: /home/john.doe/snakemake_tutorial/Data/SRR3105698_chr18.fastq.gz
|
||
SRR3105699_chr18: /home/john.doe/snakemake_tutorial/Data/SRR3105699_chr18.fastq.gz
|
||
references:
|
||
annot: "/home/john.doe/snakemake_tutorial/Data/O.tauri_annotation.gff"
|
||
fasta: "/home/john.doe/snakemake_tutorial/Data/O.tauri_genome.fna"
|
||
bowtie:
|
||
idx: "/home/john.doe/snakemake_examples/exercise2_advanced/result/bowtie2index/Otauri"
|
||
idxthreads: 1
|
||
extra: ""
|
||
alignthreads: 8
|
||
featureCounts:
|
||
extra: '-t "gene" -g "ID" -s "2"'
|
||
tempDir: "/tmp"
|
||
```
|
||
|
||
**Running the pipeline:**
|
||
|
||
Outputs will be generated in the current working directory (i.e. the directory in which you
|
||
are when you run the Snakemake command), within a `result` folder.
|
||
|
||
To use this Snakemake pipeline (tested with snakemake version 8.4.6) on the I2BC cluster you can use the little bash script provided:
|
||
|
||
```bash
|
||
qsub -V -l walltime=4:00:00 -l select=1:ncpus=1:mem=100mb runRnaseq.sh
|
||
```
|
||
|
||
If you have a look at the runRnaseq.sh script, you will see that we run snakemake twice: the first time to run the pipeline and
|
||
generate the output files; the second time to create the final report using the `--report myFinalReport.html` option.
|
||
|
||
|
||
# Output
|
||
|
||
```text
|
||
result/
|
||
├── benchmark
|
||
├── bowtie
|
||
├── fastqc
|
||
├── featureCounts
|
||
├── logs
|
||
├── multiqc_data
|
||
└── multiqc_report.html
|
||
```
|
||
|
||
|
||
# Les fonctions utilisées
|
||
|
||
## Fonctions générales
|
||
|
||
- envmodules : pour spécifier les environnements utilisés pour chaque règles (https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html#using-environment-modules)
|
||
- wildcard_constraints : permet une verification de la valeur des variables (https://snakemake.readthedocs.io/en/stable/tutorial/additional_features.html#constraining-wildcards)
|
||
|
||
## rule fastqc
|
||
|
||
- Utilisation d'une fonction au lieu d'un nom de fichier pour l'input (https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#input-functions)
|
||
- Utilisation de la fonction report pour créer un rapport après exécution du script (https://snakemake.readthedocs.io/en/stable/snakefiles/reporting.html#reports)
|
||
- multiext : variant de la fonction expand (https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#the-multiext-function)
|
||
|
||
## rule bowtieIndex
|
||
|
||
- lookup : https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#the-lookup-function
|
||
- ensure : vérification si le fichier est vide (https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#ensuring-output-file-properties-like-non-emptyness-or-checksum-compliance)
|
||
- multiext : variant de la fonction expand (https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#the-multiext-function)
|
||
|
||
## rule bowtie2 et samtoolsSort
|
||
|
||
- Utilisation d'une fonction au lieu d'un nom de fichier pour l'input (https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#input-functions)
|
||
- pipe : permet de remplacer l'utilisation de pipe et de scinder les différents éléments de la ligne de commande en plusieurs règles (https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#piped-output)
|
||
- config.get : permet de mettre une valeur par défaut si elle manque dans le fichier de configuration (NB: c'est une fonction de Python qui marche ici car config est un dictionnaire)
|
||
|
||
## rule extractCounts
|
||
|
||
- temp : suppression des fichiers une fois qu'ils ne sont plus utils (https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#protected-and-temporary-files)
|
||
|
||
## rule matrixCounts
|
||
|
||
- On peut utiliser autant de wildcards que l'on veut. Dans cette règle 2 wildcards sont utilisées (sample et colExtra).
|
demo_advanced/runRnaseq.sh | ||
---|---|---|
#!/bin/bash
|
||
|
||
module load snakemake/snakemake-8.4.6
|
||
|
||
cd /path/to/the/project/
|
||
|
||
# create directory for the pbs logs
|
||
mkdir pbs
|
||
|
||
snakemake --executor cluster-generic --cluster-generic-submit-cmd "qsub -V -l walltime={resources.time_min} -l select=1:ncpus={threads}:mem={resources.mem} -e pbs/{name}_$PBS_JOBID.err -o pbs/{name}_$PBS_JOBID.out" --snakefile codes/Snakefile --profile codes/profile --configfile configfile.yaml &> smkpipeline.txt
|
||
|
||
snakemake --executor cluster-generic --cluster-generic-submit-cmd "qsub -V -l walltime={resources.time_min} -l select=1:ncpus={threads}:mem={resources.mem}" --snakefile codes/Snakefile --profile codes/profile --configfile configfile.yaml --report smkRnaseq_report.html
|
exercise2_advanced/codes/Snakefile | ||
---|---|---|
wildcard_constraints:
|
||
sample="[a-zA-Z0-9_]+", # matches Unicode word characters
|
||
colExtra="[0-9]",
|
||
|
||
|
||
# Report name:
|
||
report: "report/alignPipeline.rst"
|
||
|
||
|
||
onsuccess:
|
||
print("Workflow finished, no error")
|
||
|
||
|
||
onerror:
|
||
print("An error occurred")
|
||
|
||
|
||
rule targets:
|
||
input:
|
||
"result/multiqc_report.html",
|
||
expand("result/fastqc/{sample}_fastqc.zip", sample=config["samples"]),
|
||
expand("result/bowtie/{sample}.bam.bai", sample=config["samples"]),
|
||
"result/featureCounts/counts_matrix.txt",
|
||
|
||
|
||
rule fastqc:
|
||
input:
|
||
lambda wc: config["samples"][wc.sample]
|
||
output:
|
||
report(
|
||
multiext("result/fastqc/{sample}", "_fastqc.zip", "_fastqc.html"),
|
||
caption="report/fastqc.rst",
|
||
category="Step 1 fastqc",
|
||
),
|
||
log:
|
||
stdout="result/logs/{sample}_fastqc.stdout",
|
||
stderr="result/logs/{sample}_fastqc.stderr",
|
||
benchmark:
|
||
"result/benchmark/{sample}_fastqc.tsv"
|
||
params:
|
||
outdir="result/fastqc",
|
||
message:
|
||
"Executing Fastqc with {threads} threads on the following sample : {wildcards.sample}"
|
||
threads: 1
|
||
envmodules:
|
||
"fastqc/fastqc_v0.11.5",
|
||
resources:
|
||
mem="1gb",
|
||
time_min="00:01:00",
|
||
shell:
|
||
"""
|
||
fastqc {input} -outdir {params.outdir} 1> {log.stdout} 2> {log.stderr}
|
||
"""
|
||
|
||
|
||
rule bowtieIndex:
|
||
input:
|
||
fasta=lookup(dpath="references/fasta", within=config), # config["references"]["fasta"]
|
||
output:
|
||
idxFile=ensure(
|
||
multiext(
|
||
config["bowtie"]["idx"],
|
||
".1.bt2",
|
||
".2.bt2",
|
||
".3.bt2",
|
||
".4.bt2",
|
||
".rev.1.bt2",
|
||
".rev.2.bt2",
|
||
),
|
||
non_empty=True,
|
||
),
|
||
log:
|
||
"result/logs/fastqc/bowtieIndex.txt",
|
||
benchmark:
|
||
"result/benchmark/bowtieIndex.tsv"
|
||
params:
|
||
idx=config["bowtie"]["idx"],
|
||
message:
|
||
"Indexing genome"
|
||
threads: max(1, config["bowtie"]["idxthreads"])
|
||
envmodules:
|
||
"nodes/bowtie2-2.4.2",
|
||
resources:
|
||
mem="1gb",
|
||
time_min="00:05:00",
|
||
shell:
|
||
"""
|
||
bowtie2-build --threads {threads} {input.fasta} {params.idx} &> {log}
|
||
"""
|
||
|
||
|
||
rule bowtie2:
|
||
input:
|
||
fq=lambda wc: config["samples"][wc.sample],
|
||
idxFile=rules.bowtieIndex.output,
|
||
output:
|
||
bam=pipe("result/bowtie/{sample}.sam"),
|
||
log:
|
||
bwtout="result/logs/bowtie/{sample}_align.txt",
|
||
stderr="result/logs/bowtie/{sample}_align.stderr",
|
||
benchmark:
|
||
"result/benchmark/{sample}_bowtie2.tsv"
|
||
params:
|
||
extra=config["bowtie"]["extra"],
|
||
idx=config["bowtie"]["idx"],
|
||
tmpDir=config.get("tempDir", "tmp"),
|
||
message:
|
||
"Executing Bowtie2 with {threads} threads on the following sample: {wildcards.sample}"
|
||
threads: max(1, config["bowtie"]["alignthreads"])
|
||
envmodules:
|
||
"nodes/bowtie2-2.4.2",
|
||
resources:
|
||
mem="2gb",
|
||
time_min="00:05:00",
|
||
shell:
|
||
"""
|
||
bowtie2 -x {params.idx} -U {input.fq} -p {threads} {params.extra} \
|
||
2> {log.bwtout} > {output}
|
||
"""
|
||
|
||
|
||
rule samtoolsSort:
|
||
input:
|
||
"result/bowtie/{sample}.sam",
|
||
output:
|
||
report(
|
||
protected("result/bowtie/{sample}.bam"),
|
||
caption="report/bowtie2.rst",
|
||
category="Step 2 Bowtie2",
|
||
),
|
||
# not a good idea because bam files are heavy
|
||
log:
|
||
"result/logs/samtools/{sample}.txt",
|
||
benchmark:
|
||
"result/benchmark/{sample}_samtoolsSort.tsv"
|
||
envmodules:
|
||
"nodes/samtools-1.11",
|
||
threads: 1
|
||
resources:
|
||
mem="2gb",
|
||
time_min="00:05:00",
|
||
params:
|
||
tmpDir=config.get("tempDir", "tmp"),
|
||
shell:
|
||
"""
|
||
samtools sort -O 'bam' -T {params.tmpDir} {input} > {output}
|
||
"""
|
||
|
||
|
||
rule samtoolsIndx:
|
||
input:
|
||
bam="result/bowtie/{sample}.bam",
|
||
output:
|
||
bai="result/bowtie/{sample}.bam.bai",
|
||
log:
|
||
"result/logs/bowtie/{sample}_index.txt",
|
||
threads: 1
|
||
resources:
|
||
mem="1gb",
|
||
time_min="00:05:00",
|
||
envmodules:
|
||
"nodes/samtools-1.11",
|
||
shell:
|
||
"""
|
||
samtools index -b {input.bam} &> {log}
|
||
"""
|
||
|
||
|
||
rule featureCounts:
|
||
input:
|
||
bam="result/bowtie/{sample}.bam",
|
||
annot=config["references"]["annot"],
|
||
output:
|
||
report(
|
||
ensure("result/featureCounts/{sample}_ftc.txt", non_empty=True),
|
||
caption="report/featureCounts.rst",
|
||
category="Step 3 featureCounts",
|
||
),
|
||
log:
|
||
"result/logs/featureCounts/{sample}_ftc.txt",
|
||
threads: 1
|
||
resources:
|
||
mem="1gb",
|
||
time_min="00:10:00",
|
||
envmodules:
|
||
"subread/subread-1.5.2",
|
||
params:
|
||
extra=config["featureCounts"]["extra"],
|
||
shell:
|
||
"""
|
||
featureCounts {params.extra} -a {input.annot} -o {output} \
|
||
{input.bam} &> {log}
|
||
"""
|
||
|
||
|
||
rule extractCounts:
|
||
input:
|
||
"result/featureCounts/{sample}_ftc.txt",
|
||
output:
|
||
temp("result/featureCounts/{sample}_ftc{colExtra}.txt"),
|
||
log:
|
||
"result/logs/featureCounts/{sample}_extractCounts{colExtra}.txt",
|
||
threads: 1
|
||
resources:
|
||
mem="1gb",
|
||
time_min="00:05:00",
|
||
shell:
|
||
"""
|
||
cut -f {wildcards.colExtra} {input} | sed 1d > {output} 2> {log}
|
||
"""
|
||
|
||
|
||
rule matrixCounts:
|
||
input:
|
||
countfile=expand(
|
||
"result/featureCounts/{sample}_ftc{colExtra}.txt",
|
||
sample=config["samples"],
|
||
colExtra="7",
|
||
),
|
||
geneID=expand(
|
||
"result/featureCounts/{sample}_ftc{colExtra}.txt",
|
||
sample=config["samples"],
|
||
colExtra="1",
|
||
),
|
||
output:
|
||
"result/featureCounts/counts_matrix.txt",
|
||
log:
|
||
"result/logs/featureCounts/counts_matrix.txt",
|
||
threads: 1
|
||
resources:
|
||
mem="1gb",
|
||
time_min="00:05:00",
|
||
shell:
|
||
"""
|
||
paste {input.geneID[0]} {input.countfile} > {output} 2> {log}
|
||
"""
|
||
|
||
|
||
rule multiqc:
|
||
input:
|
||
expand(
|
||
"result/fastqc/{sample}_fastqc.zip",
|
||
sample=config["samples"],
|
||
allow_missing=True,
|
||
),
|
||
expand("result/fastqc/{sample}_fastqc.html", sample=config["samples"]),
|
||
expand("result/bowtie/{sample}.bam", sample=config["samples"]),
|
||
expand("result/featureCounts/{sample}_ftc.txt", sample=config["samples"]),
|
||
output:
|
||
"result/multiqc_report.html",
|
||
log:
|
||
report(
|
||
"result/logs/multiqc/multiqc.log",
|
||
caption="report/multiqc.rst",
|
||
category="Step 4 multiqc",
|
||
),
|
||
threads: 1
|
||
resources:
|
||
mem="2gb",
|
||
time_min="00:30:00",
|
||
params:
|
||
dir="result",
|
||
envmodules:
|
||
"nodes/multiqc-1.9",
|
||
shell:
|
||
"""
|
||
multiqc --outdir {params.dir} {params.dir} &> {log}
|
||
"""
|
exercise2_advanced/codes/profile/config.yaml | ||
---|---|---|
restart-times: 3
|
||
latency-wait: 180
|
||
software-deployment-method: env-modules
|
||
jobs: 16
|
||
keep-going: True
|
||
printshellcmds: True
|
exercise2_advanced/codes/report/alignPipeline.rst | ||
---|---|---|
**RNA-seq: Quality control and alignment**
|
||
|
||
# RNA-seq pipeline
|
||
|
||
## Step1: Fastqc
|
||
|
||
Check the sequence quality with Fastqc_
|
||
|
||
.. _Fastqc: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/
|
||
|
||
*Params:* default parameters
|
||
|
||
*Version:* fastqc/fastqc_v0.11.5
|
||
|
||
*Cite:* Andrews S. (2010). FastQC: a quality control tool for high throughput sequence data. Available online at: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/
|
||
|
||
|
||
## Step2: Bowtie2 and Samtools
|
||
|
||
Using Bowtie2_ for alignment on a reference genome. Samtools_ allowed to filter, index and convert format
|
||
|
||
.. _Bowtie2: https://bowtie-bio.sourceforge.net/bowtie2/index.shtml
|
||
.. _Samtools: http://www.htslib.org/
|
||
|
||
*Params:* {{ snakemake.config["bowtie"]["extra"] }}
|
||
|
||
*Version:*
|
||
|
||
- Bowtie2: nodes/bowtie2-2.4.2
|
||
- Samtools: nodes/samtools-1.11
|
||
|
||
*Cite:*
|
||
|
||
- Bowtie2: Langmead B, Salzberg S. Fast gapped-read alignment with Bowtie 2. Nature Methods. 2012, 9:357-359.
|
||
- Samtools: Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R, and 1000 Genome Project Data Processing Subgroup, The Sequence alignment/map (SAM) format and SAMtools, Bioinformatics (2009) 25(16) 2078-9
|
||
|
||
|
||
## Step3: featureCounts
|
||
|
||
Using FeatureCounts_ to quantify the level of gene expression
|
||
|
||
.. _FeatureCounts: https://subread.sourceforge.net/featureCounts.html
|
||
|
||
*Params:* {{ snakemake.config["featureCounts"]["extra"] }}
|
||
|
||
*Version:* subread/subread-1.5.2
|
||
|
||
*Cite:* Liao Y, Smyth GK and Shi W (2014). featureCounts: an efficient general purpose program for assigning sequence reads to genomic features. Bioinformatics, 30(7):923-30.
|
||
|
||
|
||
## Step4: Multiqc
|
||
|
||
With Multiqc_ combine all results in html (easy-to-use)
|
||
|
||
.. _Multiqc: https://multiqc.info/
|
||
|
||
*Params:* default parameters
|
||
|
||
*Version:* nodes/multiqc-1.9
|
||
|
||
|
||
*Cite:* MultiQC: Summarize analysis results for multiple tools and samples in a single report Philip Ewels, Måns Magnusson, Sverker Lundin and Max Käller. Bioinformatics (2016)
|
||
|
||
|
||
# Pipeline
|
||
|
||
- Snakemake_ : Mölder, F., Jablonski, K.P., Letcher, B., Hall, M.B., Tomkins-Tinch, C.H., Sochat, V., Forster, J., Lee, S., Twardziok, S.O., Kanitz, A., Wilm, A., Holtgrewe, M., Rahmann, S., Nahnsen, S., Köster, J., 2021. Sustainable data analysis with Snakemake. F1000Res 10, 33.
|
||
- Conda_
|
||
|
||
.. _Snakemake: https://snakemake.readthedocs.io/en/stable/
|
||
.. _Conda: https://docs.conda.io/en/latest/
|
exercise2_advanced/codes/report/bowtie2.rst | ||
---|---|---|
Bowtie2 and samtools
|
exercise2_advanced/codes/report/fastqc.rst | ||
---|---|---|
Running fastqc with default params
|
exercise2_advanced/codes/report/featureCounts.rst | ||
---|---|---|
FeatureCounts from Subread
|
exercise2_advanced/codes/report/multiqc.rst | ||
---|---|---|
Runiing with default params
|
exercise2_advanced/configfile.yaml | ||
---|---|---|
samples:
|
||
SRR3099585_chr18: Data/SRR3099585_chr18.fastq.gz
|
||
SRR3099586_chr18: Data/SRR3099586_chr18.fastq.gz
|
||
SRR3099587_chr18: Data/SRR3099587_chr18.fastq.gz
|
||
SRR3105697_chr18: Data/SRR3105697_chr18.fastq.gz
|
||
SRR3105698_chr18: Data/SRR3105698_chr18.fastq.gz
|
||
SRR3105699_chr18: Data/SRR3105699_chr18.fastq.gz
|
||
references:
|
||
annot: "Data/O.tauri_annotation.gff"
|
||
fasta: "Data/O.tauri_genome.fna"
|
||
bowtie:
|
||
idx: "bowtie2index/Otauri"
|
||
idxthreads: 1
|
||
extra: ""
|
||
alignthreads: 8
|
||
featureCounts:
|
||
extra: '-t "gene" -g "ID" -s "2"'
|
||
tempDir: "/tmp"
|
exercise2_advanced/readme_runSnake.txt | ||
---|---|---|
# Snakemake pipeline presentation
|
||
|
||
## The code
|
||
|
||
The pipeline's `code` folder contains the following arborescence of files:
|
||
|
||
```text
|
||
├── Snakefile
|
||
├── profile
|
||
│ └── config.yaml
|
||
└── report
|
||
├── alignPipeline.rst
|
||
├── bowtie2.rst
|
||
├── fastqc.rst
|
||
├── featureCounts.rst
|
||
└── multiqc.rst
|
||
```
|
||
|
||
The Snakefile is the file that contains all of Snakemake's rules. The configuration file `profile/config.yaml`
|
||
is a way of specifying options that you would otherwise add to the snakemake command line. The `report` folder
|
||
contains a set of hand-written report templates for each tool or method that was used and that will later
|
||
be incorporated into a final report file generated by Snakemake.
|
||
|
||
NB: this Snakefile was reformatted using the snakefmt tool (v0.10.0) in order to comply with the commonly
|
||
accepted (and encouraged) Snakemake formatting standards
|
||
|
||
## The required data (to download)
|
||
|
||
This pipeline runs on single-end bulk RNA-seq data and outputs quality reports on the samples provided as well as tables of
|
||
gene expression counts per sample.
|
||
|
||
Example input files can be found under (this link)[https://doi.org/10.5281/zenodo.8340293] (same as in Exercises 1A, 1B and 1C, see (this page)[https://bioi2.i2bc.paris-saclay.fr/training/snakemake/exercises]):
|
||
|
||
```text
|
||
Data/
|
||
├── O.tauri_annotation.gff
|
||
├── O.tauri_genome.fna
|
||
├── SRR3099585_chr18.fastq.gz
|
||
├── SRR3099586_chr18.fastq.gz
|
||
├── SRR3099587_chr18.fastq.gz
|
||
├── SRR3105697_chr18.fastq.gz
|
||
├── SRR3105698_chr18.fastq.gz
|
||
└── SRR3105699_chr18.fastq.gz
|
||
```
|
||
|
||
They contain 6 fastq files (chromosome 18 from SRR309958[5,6,7] and SRR310569[7,8,9]), as well as
|
||
the genome fasta of their reference species *O. tauri* and its annotation.
|
||
|
||
# Usage
|
||
|
||
**Requirements:**
|
||
- tools: snakemake, fastqc, bowtie, featureCounts, multiqc
|
||
- data: fastq files and their corresponding reference genome
|
||
|
||
**Before you run the pipeline:**
|
||
- edit the line `cd /path/to/the/project/` in the `runRnaseq.sh` file to your actual project folder e.g. `/path/to/snakemake_examples/exercise2_advanced/`
|
||
- edit the configuration file `configfile.yaml` so that the paths to the various samples and reference files are correct (a good habit is to use absolute paths) so that Snakemake can find them
|
||
|
||
For example, if my working directories look like this:
|
||
|
||
```text
|
||
/home/john.doe/snakemake_tutorial/Data/
|
||
├── O.tauri_annotation.gff
|
||
├── O.tauri_genome.fna
|
||
├── SRR3099585_chr18.fastq.gz
|
||
├── SRR3099586_chr18.fastq.gz
|
||
├── SRR3099587_chr18.fastq.gz
|
||
├── SRR3105697_chr18.fastq.gz
|
||
├── SRR3105698_chr18.fastq.gz
|
||
└── SRR3105699_chr18.fastq.gz
|
||
```
|
||
and
|
||
```text
|
||
/home/john.doe/snakemake_examples/exercise2_advanced/code/
|
||
├── Snakefile
|
||
├── profile
|
||
│ └── config.yaml
|
||
└── report
|
||
├── alignPipeline.rst
|
||
├── bowtie2.rst
|
||
├── fastqc.rst
|
||
├── featureCounts.rst
|
||
└── multiqc.rst
|
||
```
|
||
|
||
Then my configuration file should look like this (for example):
|
||
|
||
```text
|
||
samples:
|
||
SRR3099585_chr18: /home/john.doe/snakemake_tutorial/Data/SRR3099585_chr18.fastq.gz
|
||
SRR3099586_chr18: /home/john.doe/snakemake_tutorial/Data/SRR3099586_chr18.fastq.gz
|
||
SRR3099587_chr18: /home/john.doe/snakemake_tutorial/Data/SRR3099587_chr18.fastq.gz
|
||
SRR3105697_chr18: /home/john.doe/snakemake_tutorial/Data/SRR3105697_chr18.fastq.gz
|
||
SRR3105698_chr18: /home/john.doe/snakemake_tutorial/Data/SRR3105698_chr18.fastq.gz
|
||
SRR3105699_chr18: /home/john.doe/snakemake_tutorial/Data/SRR3105699_chr18.fastq.gz
|
||
references:
|
||
annot: "/home/john.doe/snakemake_tutorial/Data/O.tauri_annotation.gff"
|
||
fasta: "/home/john.doe/snakemake_tutorial/Data/O.tauri_genome.fna"
|
||
bowtie:
|
||
idx: "/home/john.doe/snakemake_examples/exercise2_advanced/result/bowtie2index/Otauri"
|
||
idxthreads: 1
|
||
extra: ""
|
||
alignthreads: 8
|
||
featureCounts:
|
||
extra: '-t "gene" -g "ID" -s "2"'
|
||
tempDir: "/tmp"
|
||
```
|
||
|
||
**Running the pipeline:**
|
||
|
||
Outputs will be generated in the current working directory (i.e. the directory in which you
|
||
are when you run the Snakemake command), within a `result` folder.
|
||
|
||
To use this Snakemake pipeline (tested with snakemake version 8.4.6) on the I2BC cluster you can use the little bash script provided:
|
||
|
||
```bash
|
||
qsub -V -l walltime=4:00:00 -l select=1:ncpus=1:mem=100mb runRnaseq.sh
|
||
```
|
||
|
||
If you have a look at the runRnaseq.sh script, you will see that we run snakemake twice: the first time to run the pipeline and
|
||
generate the output files; the second time to create the final report using the `--report myFinalReport.html` option.
|
||
|
||
|
||
# Output
|
||
|
||
```text
|
||
result/
|
||
├── benchmark
|
||
├── bowtie
|
||
├── fastqc
|
||
├── featureCounts
|
||
├── logs
|
||
├── multiqc_data
|
||
└── multiqc_report.html
|
||
```
|
||
|
||
|
||
# Les fonctions utilisées
|
||
|
||
## Fonctions générales
|
||
|
||
- envmodules : pour spécifier les environnements utilisés pour chaque règles (https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html#using-environment-modules)
|
||
- wildcard_constraints : permet une verification de la valeur des variables (https://snakemake.readthedocs.io/en/stable/tutorial/additional_features.html#constraining-wildcards)
|
||
|
||
## rule fastqc
|
||
|
||
- Utilisation d'une fonction au lieu d'un nom de fichier pour l'input (https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#input-functions)
|
||
- Utilisation de la fonction report pour créer un rapport après exécution du script (https://snakemake.readthedocs.io/en/stable/snakefiles/reporting.html#reports)
|
||
- multiext : variant de la fonction expand (https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#the-multiext-function)
|
||
|
||
## rule bowtieIndex
|
||
|
||
- lookup : https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#the-lookup-function
|
||
- ensure : vérification si le fichier est vide (https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#ensuring-output-file-properties-like-non-emptyness-or-checksum-compliance)
|
||
- multiext : variant de la fonction expand (https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#the-multiext-function)
|
||
|
||
## rule bowtie2 et samtoolsSort
|
||
|
||
- Utilisation d'une fonction au lieu d'un nom de fichier pour l'input (https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#input-functions)
|
||
- pipe : permet de remplacer l'utilisation de pipe et de scinder les différents éléments de la ligne de commande en plusieurs règles (https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#piped-output)
|
||
- config.get : permet de mettre une valeur par défaut si elle manque dans le fichier de configuration (NB: c'est une fonction de Python qui marche ici car config est un dictionnaire)
|
||
|
||
## rule extractCounts
|
||
|
||
- temp : suppression des fichiers une fois qu'ils ne sont plus utils (https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#protected-and-temporary-files)
|
||
|
||
## rule matrixCounts
|
||
|
||
- On peut utiliser autant de wildcards que l'on veut. Dans cette règle 2 wildcards sont utilisées (sample et colExtra).
|
exercise2_advanced/runRnaseq.sh | ||
---|---|---|
#!/bin/bash
|
||
|
||
module load snakemake/snakemake-8.4.6
|
||
|
||
cd /path/to/the/project/
|
||
|
||
# create directory for the pbs logs
|
||
mkdir pbs
|
||
|
||
snakemake --executor cluster-generic --cluster-generic-submit-cmd "qsub -V -l walltime={resources.time_min} -l select=1:ncpus={threads}:mem={resources.mem} -e pbs/{name}_$PBS_JOBID.err -o pbs/{name}_$PBS_JOBID.out" --snakefile codes/Snakefile --profile codes/profile --configfile configfile.yaml &> smkpipeline.txt
|
||
|
||
snakemake --executor cluster-generic --cluster-generic-submit-cmd "qsub -V -l walltime={resources.time_min} -l select=1:ncpus={threads}:mem={resources.mem}" --snakefile codes/Snakefile --profile codes/profile --configfile configfile.yaml --report smkRnaseq_report.html
|
Also available in: Unified diff
rename demo_advanced folder