1 Installation

Install the package from Bioconductor.

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install("RcwlPipelines")

The development version is also available to download from GitHub.

BiocManager::install("hubentu/RcwlPipelines")

Load the package into the R session.

library(RcwlPipelines)
library(dplyr)

2 Tools and pipelines scripts

The R scripts to build the CWL tools and pipelines are now collected in a GitHub repository (https://github.com/hubentu/RcwlRecipes), which is a community effort to collect Bioinformatics tools and pipelines using Rcwl and CWL (Common Workflow Language).

Three functions are used to collect the Rcwl scripts, search tool recipes by keywords, and load the scripts into the current R environment.

2.1 Indexing recipe scripts

The cwlUpdate function can update the recipe scripts from the GitHub repository and collect metadata into a local cache using the BiocFileCache package. By default, the local cache is created under your home directory the first time the function is run. Here we use a temporary directory as an example.

tools <- cwlUpdate(cachePath = tempfile())
#> Update scripts...
tools
#> class: BiocFileCache
#> bfccache: /tmp/RtmpFYhwHG/filebfd67d5babd
#> bfccount: 130
#> For more information see: bfcinfo() or bfcquery()

2.2 Search by keyword

The function cwlSearch can help to search indexed recipes by keywords. For example, here we try to find the alignment tool bwa mem.

tl <- cwlSearch(c("bwa", "mem"), tools)
data.frame(tl)
#>     rid  rname         create_time         access_time
#> 1 BFC82 tl_bwa 2021-03-06 02:40:36 2021-03-06 02:40:36
#>                                                               rpath rtype
#> 1 /tmp/RtmpFYhwHG/filebfd67d5babd/RcwlRecipes-rcwl1.6/Rcwl/tl_bwa.R local
#>                                                               fpath
#> 1 /tmp/RtmpFYhwHG/filebfd67d5babd/RcwlRecipes-rcwl1.6/Rcwl/tl_bwa.R
#>   last_modified_time etag expires Type Command
#> 1                 NA <NA>      NA tool bwa mem
#>                             Container
#> 1 biocontainers/bwa:v0.7.17-3-deb_cv1

2.3 Loading tools and pipelines

The function cwlLoad can be used to “install” tools or pipelines into the current environment, given the script path.

bwa <- cwlLoad(tl$rpath)
#> bwa loaded
bwa
#> class: cwlParam 
#>  cwlClass: CommandLineTool 
#>  cwlVersion: v1.0 
#>  baseCommand: bwa mem 
#> requirements:
#> - class: DockerRequirement
#>   dockerPull: biocontainers/bwa:v0.7.17-3-deb_cv1
#> inputs:
#>   threads (int): -t 
#>   RG (string): -R 
#>   Ref (File):  
#>   FQ1 (File):  
#>   FQ2 (File?):  
#> outputs:
#> sam:
#>   type: File
#>   outputBinding:
#>     glob: '*.sam'
#> stdout: bwaOutput.sam

Or we can load the tool directly by its rname.

bwa <- cwlLoad(rname = 'tl_bwa', bfc = tools)
#> bwa loaded

That’s it! The tool “bwa” is ready to use.

3 Build a pipeline

We can develop a pipeline by utilizing the available tools. For example, a simple alignment pipeline with mapping and duplicate marking can be built from the tools.

First, we check whether the required tools (bwa, samtools and picard markduplicates) are available.

tls <- cwlSearch("bwa|sam2bam|sortBam|samtools_index|markdup", tools) %>%
    filter(Type == "tool") %>%
    select(rname, rpath, Command, Container)
tls
#> # A tibble: 6 x 4
#>   rname      rpath                             Command      Container           
#>   <chr>      <chr>                             <chr>        <chr>               
#> 1 tl_bwa     /tmp/RtmpFYhwHG/filebfd67d5babd/… bwa mem      biocontainers/bwa:v…
#> 2 tl_bwa_in… /tmp/RtmpFYhwHG/filebfd67d5babd/… bwa index    biocontainers/bwa:v…
#> 3 tl_markdup /tmp/RtmpFYhwHG/filebfd67d5babd/… picard Mark… quay.io/biocontaine…
#> 4 tl_sam2bam /tmp/RtmpFYhwHG/filebfd67d5babd/… samtools vi… biocontainers/samto…
#> 5 tl_samtoo… /tmp/RtmpFYhwHG/filebfd67d5babd/… samtools in… biocontainers/samto…
#> 6 tl_sortBam /tmp/RtmpFYhwHG/filebfd67d5babd/… samtools so… biocontainers/samto…

Then we load all of the tools.

invisible(sapply(tls$rpath, cwlLoad))
#> bwa loaded
#> bwa_index loaded
#> markdup loaded
#> sam2bam loaded
#> samtools_index loaded
#> sortBam loaded

Next, we define the input parameters.

p1 <- InputParam(id = "threads", type = "int")
p2 <- InputParam(id = "RG", type = "string")
p3 <- InputParam(id = "Ref", type = "string")
p4 <- InputParam(id = "FQ1", type = "File")
p5 <- InputParam(id = "FQ2", type = "File?")

Then we define the pipeline steps, from raw fastqs to duplicate-marked alignments.

## bwa
s1 <- Step(id = "bwa", run = bwa,
           In = list(threads = "threads",
                     RG = "RG",
                     Ref = "Ref",
                     FQ1 = "FQ1",
                     FQ2 = "FQ2"))
## sam to bam
s2 <- Step(id = "sam2bam", run = sam2bam,
           In = list(sam = "bwa/sam"))
## sort bam
s3 <- Step(id = "sortBam", run = sortBam,
           In = list(bam = "sam2bam/bam"))
## mark duplicates
s4 <- Step(id = "markdup", run = markdup,
           In = list(ibam = "sortBam/sbam",
                     obam = list(
                         valueFrom="$(inputs.ibam.nameroot).mdup.bam"),
                     matrix = list(
                         valueFrom="$(inputs.ibam.nameroot).markdup.txt")))
## index bam
s5 <- Step(id = "idxBam", run = samtools_index,
           In = list(bam = "markdup/mBam"))

Last, we define the outputs and connect the steps into a new pipeline.

req1 <- list(class = "StepInputExpressionRequirement")
req2 <- list(class = "InlineJavascriptRequirement")
## outputs
o1 <- OutputParam(id = "Bam", type = "File", outputSource = "markdup/mBam")
o2 <- OutputParam(id = "Idx", type = "File", outputSource = "idxBam/idx")
## stepParam
Align <- cwlStepParam(requirements = list(req1, req2),
                      inputs = InputParamList(p1, p2, p3, p4, p5),
                      outputs = OutputParamList(o1, o2))
## build pipeline
Align <- Align + s1 + s2 + s3 + s4 + s5

The pipeline is ready for use. We can plot the pipeline with plotCWL from the Rcwl package.

plotCWL(Align)
#> Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
#> Using compatibility `.name_repair`.

4 Pipelines summary

There are mainly 4 pipelines collected in this package. Here is a brief introduction to these pipelines. More pipelines and tools are expected to be included in the future.

4.1 DNASeq alignment pipeline

The pipeline can be used to preprocess DNA sequences in fastq format. It can take paired fastqs and read groups from multiple batches as input.

alignMerge <- cwlLoad(rname = "pl_alignMerge", bfc = tools)
#> bwa loaded
#> sam2bam loaded
#> sortBam loaded
#> samtools_index loaded
#> bwaAlign loaded
#> mergeBam loaded
#> markdup loaded
#> samtools_index loaded
#> samtools_flagstat loaded
#> mergeBamDup loaded
#> alignMerge loaded
inputs(alignMerge)
#> inputs:
#>   idBam (string):  
#>   RG (string[]):  
#>   threads (int):  
#>   Ref (File):  
#>   FQ1s (File[]):  
#>   FQ2s (File[]):

The pipeline includes two steps and several jobs will be run in each step.

bwaAlign: bwa alignment by read groups.

runs(runs(alignMerge)[[1]])
#> List of length 4
#> names(4): bwa sam2bam sortBam idxBam

bwa: To align fastqs and read groups to reference genome with bwa.
sam2bam: To convert the alignments in “sam” format to “bam” format with samtools.
sortBam: To sort the “bam” file by coordinates with samtools.
idxBam: To index “bam” file with samtools.

mergeBamDup: Merge by samples and mark duplicates.

runs(runs(alignMerge)[[2]])
#> List of length 4
#> names(4): mergeBam markdup samtools_index samtools_flagstat

mergeBam: To merge bam files from multiple batches with picard.
markdup: To mark duplicates with picard.
samtools_index: To index bam file with samtools.
samtools_flagstat: To summarize flags in bam with samtools.

The final bam files after duplicate marking, the bam index, the duplicates matrix, and the flag statistics summary will be in the output folder.

outputs(alignMerge)
#> outputs:
#> oBam:
#>   type: File
#>   outputSource: mergeBamDup/oBam
#> matrix:
#>   type: File
#>   outputSource: mergeBamDup/matrix
#> Idx:
#>   type: File
#>   outputSource: mergeBamDup/Idx
#> stat:
#>   type: File
#>   outputSource: mergeBamDup/stat

Here you can find an example to run the pipeline.

https://hubentu.github.io/others/Rcwl/application.html#dnaseq-alignment-pipeline

4.2 RNASeq pipeline

The pipeline was built with reads quality summary, STAR alignment, quantification by featureCounts and RSeQC quality control. Here are the inputs.

ranseq_Sf <- cwlLoad(rname = "pl_rnaseq_Sf", bfc = tools)
#> fastqc loaded
#> STAR loaded
#> sortBam loaded
#> samtools_index loaded
#> samtools_flagstat loaded
#> featureCounts loaded
#> gtfToGenePred loaded
#> genePredToBed loaded
#> read_distribution loaded
#> geneBody_coverage loaded
#> RSeQC loaded
#> gtfToGenePred loaded
#> genePredToBed loaded
#> read_distribution loaded
#> geneBody_coverage loaded
#> STAR loaded
#> gCoverage loaded
#> rnaseq_Sf loaded
inputs(rnaseq_Sf)
#> inputs:
#>   in_seqfiles (File[]):  
#>   in_prefix (string):  
#>   in_genomeDir (Directory):  
#>   in_GTFfile (File):  
#>   in_runThreadN (int):  1

The pipeline includes 6 steps.

fastqc: To run quality summary for raw fastqs with fastqc.
STAR: To align fastqs with STAR.
samtools_index: To index aligned bam file.
samtools_flagstat: To summarize alignment flags.
featureCounts: To quantify gene abundances.
RSeQC: Several steps included.
- gtfToGenePred: To convert GTF annotation to “genePred” format.
- genePredToBed: To convert “genePred” annotation to “bed” format.
- r_distribution: To run reads distribution over genome features.
- gCoverage: To summarize read coverage over gene body.

The outputs and logs from alignment, quantification and QC steps are collected together to the output folder. A final QC report could be generated by multiqc, which is also available in the data package.

An example to run the pipeline.

https://hubentu.github.io/others/Rcwl/application.html#rnaseq-pipeline

4.3 GATK4 germline variant calling pipeline

The GATK4 best practice pipeline for germline variant calling was implemented with Workflow Description Language (WDL). We wrapped the WDL pipeline into 3 steps with Rcwl. The details of the pipeline can be found here: https://software.broadinstitute.org/gatk/best-practices/workflow?id=11145

GAlign GATK alignment.

The fastqs, sample information and customized json files for WDL are required as inputs. Multiple steps will run in this step, including bwa alignment, mark duplicates and base quality recalibration. GATK ready BAM files will be collected to the output directory.

hapCall HaplotypeCaller.

The GATK ready BAM and customized json files are inputs in this step. The local paths of GATK bundle files are required to be modified in your json file. A “gVCF” file will be generated.

jdCall Joint variant discovery

This step will combine the “gVCF” files and then call germline variants in all samples. The paths of the local bundle files are also required to be changed in the json template file. The final VCF file of germline variants will be collected.

An example to run the pipeline.
https://hubentu.github.io/others/Rcwl/application.html#gatk4-germline-variant-calling-pipeline

4.4 GATK4 Somatic short variant pipeline

The GATK4 Mutect2 pipeline for somatic variant calling is also available in WDL. The pipeline was reimplemented with Rcwl based on the best practice documents. https://software.broadinstitute.org/gatk/best-practices/workflow?id=11146

GPoN <- cwlLoad(rname = "pl_GPoN", bfc = tools)
#> GenomicsDB loaded
#> PoN loaded
#> GPoN loaded
Mutect2PL <- cwlLoad(rname = "pl_Mutect2PL", bfc = tools)
#> Mutect2 loaded
#> GetPileupSummaries loaded
#> CalculateContamination loaded
#> FilterMutectCalls loaded
#> ColSeqArtifact loaded
#> FilterOBias loaded
#> bcfview loaded
#> Mutect2PL loaded

Variant calling on normal samples

First, we need to run Mutect2 in tumor-only mode for each normal sample using the tool Mutect2. The argument “--max-mnp-distance 0” must be added because the next step, “GenomicsDBImport”, can’t handle MNPs.

arguments(Mutect2) <- list("--max-mnp-distance", "0")
Mutect2
#> class: cwlParam 
#>  cwlClass: CommandLineTool 
#>  cwlVersion: v1.0 
#>  baseCommand: gatk Mutect2 
#> requirements:
#> - class: DockerRequirement
#>   dockerPull: broadinstitute/gatk:4.1.3.0
#> arguments: --max-mnp-distance 0 
#> inputs:
#>   tbam (File): -I 
#>   nbam (File?): -I 
#>   Ref (File): -R 
#>   normal (string?): -normal 
#>   germline (File?): --germline-resource 
#>   pon (File?): --panel-of-normals 
#>   interval (File?): -L 
#>   out (string): -O 
#> outputs:
#> vout:
#>   type: File
#>   secondaryFiles:
#>   - .idx
#>   - .stats
#>   outputBinding:
#>     glob: $(inputs.out)

Panel of normals

This step creates a GenomicsDB and then combines all the normal Mutect2 calls into a VCF output for the panel of normals. A CWL pipeline, GPoN, was built to create the panel VCF.

runs(GPoN)
#> List of length 2
#> names(2): GenomicsDB PoN

Mutect2 and variant filtering

This pipeline includes two main steps. First we call a large set of candidate somatic variants, then filter them by estimated contamination and orientation bias artifacts. We can plot the Mutect2PL pipeline to show the details.

plotCWL(Mutect2PL)

5 SessionInfo

sessionInfo()
#> R version 4.0.4 (2021-02-15)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 18.04.5 LTS
#> 
#> Matrix products: default
#> BLAS:   /home/biocbuild/bbs-3.12-bioc/R/lib/libRblas.so
#> LAPACK: /home/biocbuild/bbs-3.12-bioc/R/lib/libRlapack.so
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C              
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> attached base packages:
#> [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
#> [8] methods   base     
#> 
#> other attached packages:
#> [1] dplyr_1.0.5          RcwlPipelines_1.6.2  BiocFileCache_1.14.0
#> [4] dbplyr_2.1.0         Rcwl_1.6.0           S4Vectors_0.28.1    
#> [7] BiocGenerics_0.36.0  yaml_2.2.1           BiocStyle_2.18.1    
#> 
#> loaded via a namespace (and not attached):
#>  [1] httr_1.4.2          tidyr_1.1.3         sass_0.3.1         
#>  [4] bit64_4.0.5         jsonlite_1.7.2      R.utils_2.10.1     
#>  [7] bslib_0.2.4         shiny_1.6.0         assertthat_0.2.1   
#> [10] BiocManager_1.30.10 base64url_1.4       blob_1.2.1         
#> [13] progress_1.2.2      pillar_1.5.1        RSQLite_2.2.3      
#> [16] backports_1.2.1     glue_1.4.2          digest_0.6.27      
#> [19] RColorBrewer_1.1-2  promises_1.2.0.1    checkmate_2.0.0    
#> [22] htmltools_0.5.1.1   httpuv_1.5.5        R.oo_1.24.0        
#> [25] pkgconfig_2.0.3     bookdown_0.21       DiagrammeR_1.0.6.1 
#> [28] purrr_0.3.4         xtable_1.8-4        brew_1.0-6         
#> [31] later_1.1.0.1       BiocParallel_1.24.1 tibble_3.1.0       
#> [34] generics_0.1.0      ellipsis_0.3.1      cachem_1.0.4       
#> [37] withr_2.4.1         cli_2.3.1           magrittr_2.0.1     
#> [40] crayon_1.4.1        mime_0.10           ps_1.6.0           
#> [43] memoise_2.0.0       evaluate_0.14       R.methodsS3_1.8.1  
#> [46] fansi_0.4.2         tools_4.0.4         data.table_1.14.0  
#> [49] prettyunits_1.1.1   hms_1.0.0           lifecycle_1.0.0    
#> [52] stringr_1.4.0       compiler_4.0.4      jquerylib_0.1.3    
#> [55] rlang_0.4.10        debugme_1.1.0       rstudioapi_0.13    
#> [58] rappdirs_0.3.3      htmlwidgets_1.5.3   visNetwork_2.0.9   
#> [61] igraph_1.2.6        rmarkdown_2.7       codetools_0.2-18   
#> [64] DBI_1.1.1           curl_4.3            R6_2.5.0           
#> [67] knitr_1.31          fastmap_1.1.0       bit_4.0.4          
#> [70] utf8_1.1.4          stringi_1.5.3       Rcpp_1.0.6         
#> [73] vctrs_0.3.6         batchtools_0.9.15   tidyselect_1.1.0   
#> [76] xfun_0.21

Rcwl Pipelines

2021-03-05