commit 428015d24726790fca2251ac7cc7bfcd826b69b6
Author: Chloe Quignot <>
Date:   Tue Jun 25 11:26:21 2024 +0200

    add README and exercise 0 improved after 1C

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..0f2436c
--- /dev/null
+++ b/README.md
@@ -0,0 +1,43 @@
+# Examples and solutions of the Snakemake BIOI2 training session
+
+Instructions are on the BIOI2 website: https://bioi2.i2bc.paris-saclay.fr/training/snakemake/
+
+To download this repository, open a terminal and type:
+```bash
+git clone https://forge.i2bc.paris-saclay.fr/git/bioi2_formations/snakemake_examples.git
+```
+
+## Organisation of this repository
+
+```text
+├── README.md
+├── exercise0
+├── exercise0_improved_after_1A
+├── exercise0_improved_after_1B
+├── exercise0_improved_after_1C
+└── demo_advanced
+```
+
+### Exercise 0
+
+The example Snakemake pipeline to execute in [Exercise 0](https://bioi2.i2bc.paris-saclay.fr/training/snakemake/exercises/exercise-0-objective) is in the `exercise0` folder.
+
+The **exercise0_improved_after_1X** folders are examples of improvements of the initial Snakefile after applying what you've learnt in Exercises 1A, 1B and 1C. We advise you to have a look at them once you've finished the aforementioned exercises.
+
+### Exercise 2
+
+The `demo_advanced` folder is an example solution for Exercise 2 that showcases several different types of syntax that you could encounter in Snakefiles.
+
+
+## Executing the Snakefiles
+
+If you're in the folder that contains the Snakefile (and if the Snakefile is named Snakefile), you can just type:
+
+```bash
+snakemake --cores 1
+```
+
+If you'd like to specify the Snakefile in the command line (because it's not in your current directory or because it's named differently):
+```bash
+snakemake --cores 1 -s /path/to/snakefile.smk
+```
diff --git a/exercise0_improved_after_1C/Snakefile b/exercise0_improved_after_1C/Snakefile
new file mode 100644
index 0000000..09a24a6
--- /dev/null
+++ b/exercise0_improved_after_1C/Snakefile
@@ -0,0 +1,83 @@
+import yaml
+
+with open('samples.yaml', 'r') as file:  # relative path: resolved from the working directory
+    content = yaml.safe_load(file)
+samples = content['samples']  # identifiers substituted for the {sample} wildcard below
+
+
+rule targets:  # first rule of the file: lists every file the pipeline should produce
+    input:
+        expand("fasta/{sample}.fasta", sample=samples),  # one FASTA file per sample (rule loadData)
+        "fusionFasta/allSequences.fasta",  # all sample FASTA files concatenated (rule fusionFasta)
+        "mafft/mafft_res.fasta",  # mafft output (rule mafft)
+
+
+# Update 1: add the threads directive to all rules specifying
+#           the maximum number of threads/CPUs/processors to 
+#           use per rule
+# Update 2: add the resources directive to all rules specifying
+#           the maximum amount of memory, walltime etc. to 
+#           use per rule
+rule loadData:  # download the FASTA file of one sample from UniProt with wget
+    output:
+        "fasta/{sample}.fasta",
+    params:
+        dirFasta = "fasta",  # download directory given to wget; same folder as in output
+    log:
+        stdout = "logs/{sample}_wget.stdout",  # receives the command's stdout
+        stderr = "logs/{sample}_wget.stderr",  # receives wget's own log via --output-file
+    threads: 1  # used as ncpus in the profile's qsub command
+    resources:
+        mem="1gb",
+        time_min="00:05:00",  # used as walltime in the profile's qsub command
+    shell:
+        """
+            wget --output-file {log.stderr} \
+               --directory-prefix {params.dirFasta} \
+               https://www.uniprot.org/uniprot/{wildcards.sample}.fasta > {log.stdout}
+        """
+
+
+rule fusionFasta:  # concatenate all per-sample FASTA files into a single file
+    input:
+        expand("fasta/{sample}.fasta", sample=samples),  # every file produced by rule loadData
+    output:
+        "fusionFasta/allSequences.fasta",
+    log:
+        "logs/fusionData.stderr",  # receives the stderr of the cat command
+    threads: 1  # used as ncpus in the profile's qsub command
+    resources:
+        mem="1gb",
+        time_min="00:05:00",  # used as walltime in the profile's qsub command
+    shell:
+        """
+            cat {input} > {output} 2> {log}
+        """
+
+
+# Update 3: add the envmodules directive to rules that use 
+#           non-standard tools such as mafft so that Snakemake 
+#           automatically "activates" the tool on the cluster
+#           NB: use "module avail" to see the right syntax
+rule mafft:  # run mafft on the concatenated FASTA file
+    input: 
+        "fusionFasta/allSequences.fasta",
+    output:
+        "mafft/mafft_res.fasta",  # receives mafft's stdout
+    log:
+        "logs/whichMafft.txt",        # receives mafft's stderr
+    threads: 1  # used as ncpus in the profile's qsub command
+    resources:
+        mem="1gb",
+        time_min="00:05:00",  # used as walltime in the profile's qsub command
+    envmodules:
+        "nodes/mafft-7.475"  # module name as shown by "module avail" on the cluster
+    shell:
+        """
+            mafft {input} > {output} 2> {log}
+        """
+
+
+# Update 4: add a profile configuration file: profile/config.yaml
+#           to specify options instead of specifying them in the 
+#           command line at execution
diff --git a/exercise0_improved_after_1C/profile/config.yaml b/exercise0_improved_after_1C/profile/config.yaml
new file mode 100644
index 0000000..4f40d82
--- /dev/null
+++ b/exercise0_improved_after_1C/profile/config.yaml
@@ -0,0 +1,17 @@
+# cluster-specific options (for PBSpro environment):
+jobs: 6
+executor: cluster-generic
+cluster-generic-submit-cmd: "qsub -l ncpus={threads} -l mem={resources.mem} -l walltime={resources.time_min}"
+cluster-generic-cancel-cmd: "qdel"
+# set default resources for each job to 1 cpu, 1Gb of memory and 2 hours of walltime if not specified otherwise:
+default-resources: [threads=1, mem="1Gb", time_min="02:00:00"]
+# software option:
+software-deployment-method: env-modules
+# to avoid typing -p every time:
+printshellcmds: True
+# keep running the jobs that are independent of a failed job instead of stopping the whole workflow:
+keep-going: True
+# retry a failed job up to 3 times
+restart-times: 3
+# in case of filesystem latency on the cluster, wait up to 180 seconds for output files to appear
+latency-wait: 180
\ No newline at end of file
diff --git a/exercise0_improved_after_1C/readme_runSnake.txt b/exercise0_improved_after_1C/readme_runSnake.txt
new file mode 100644
index 0000000..f2a84f3
--- /dev/null
+++ b/exercise0_improved_after_1C/readme_runSnake.txt
@@ -0,0 +1,8 @@
+Pour faire fonctionner le pipeline il faut se connecter sur un noeud du cluster puis:
+
+- charger l'environnement snakemake:
+module load snakemake/snakemake-8.4.6
+module load nodes/mafft-7.475
+
+- exécuter le programme: se placer dans ce dossier et lancer:
+snakemake --cores 1 
diff --git a/exercise0_improved_after_1C/samples.yaml b/exercise0_improved_after_1C/samples.yaml
new file mode 100644
index 0000000..536674d
--- /dev/null
+++ b/exercise0_improved_after_1C/samples.yaml
@@ -0,0 +1 @@
+samples: ["P01325", "P01308"]
\ No newline at end of file