From 3fce863d69a2860cf594d0ab50c85dfc372bd145 Mon Sep 17 00:00:00 2001 From: jhudsl-robot Date: Thu, 10 Aug 2023 16:00:09 +0000 Subject: [PATCH] Template cleanup --- .github/sync.yml | 114 ----- .github/workflows/send-updates.yml | 45 -- .github/workflows/starting-course.yml | 138 ------ .github/workflows/test-send-updates.yml | 40 -- docs/404.html | 214 --------- docs/index.html | 216 --------- docs/index.md | 9 - docs/running-a-workflow-in-terra.html | 571 ------------------------ docs/terra_walkthrough.md | 296 ------------ 9 files changed, 1643 deletions(-) delete mode 100644 .github/sync.yml delete mode 100644 .github/workflows/send-updates.yml delete mode 100644 .github/workflows/starting-course.yml delete mode 100644 .github/workflows/test-send-updates.yml delete mode 100644 docs/404.html delete mode 100644 docs/index.html delete mode 100644 docs/index.md delete mode 100644 docs/running-a-workflow-in-terra.html delete mode 100644 docs/terra_walkthrough.md diff --git a/.github/sync.yml b/.github/sync.yml deleted file mode 100644 index d0f474c..0000000 --- a/.github/sync.yml +++ /dev/null @@ -1,114 +0,0 @@ -# Candace Savonen Aug 2021 -# For info on how to update this file see: https://github.com/marketplace/actions/repo-file-sync-action#%EF%B8%8F-sync-configuration - -group: - - files: - - source: .github/workflows/ - dest: .github/workflows/ - deleteOrphaned: true - exclude: | - send-updates.yml - test-send-updates.yml - starting-course.yml - release-notes.yml - docker-test.yml - docker-build.yml - - source: scripts/ - dest: scripts/ - deleteOrphaned: true - - source: .github/ISSUE_TEMPLATE/course-problem-report.md - dest: .github/ISSUE_TEMPLATE/course-problem-report.md - - source: .github/ISSUE_TEMPLATE/course-content-add.md - dest: .github/ISSUE_TEMPLATE/course-content-add.md - - source: style-sets - dest: style-sets - - source: assets/box_images/ - dest: assets/box_images/ - # Repositories to receive changes - repos: | - jhudsl/Documentation_and_Usability 
- jhudsl/Informatics_Research_Leadership - jhudsl/Data_Management_for_Cancer_Research - jhudsl/Computing_for_Cancer_Informatics - jhudsl/Adv_Reproducibility_in_Cancer_Informatics - jhudsl/Reproducibility_in_Cancer_Informatics - jhudsl/Dissemination_and_Engagement - jhudsl/Cancer_Clinical_Informatics - jhudsl/Machine_Learning_for_Cancer_Informatics - jhudsl/Cancer_Informatics_Data_Visualization - jhudsl/Cancer_Genome_Informatics - jhudsl/Cancer-Imaging-Informatics - datatrail-jhu/DataTrail_Template - abyzovlab/CNVpytor-course - opencasestudies/OCS_Guide - tmm211/dsp-terra-mooc-test2 - fhdsl/Choosing_Genomics_Tools - fhdsl/NIH_Data_Sharing - fhdsl/FH_Cluster_Guide - fhdsl/Using_Leanpub - fhdsl/Ethical_Data_Handling_for_Cancer_Research - fhdsl/Overleaf_and_LaTeX_for_Scientific_Articles - fhdsl/Design_of_Data_Analysis - fhdsl/Tools_for_Reproducible_Workflows_in_R - fhdsl/Introduction_to_reproducible_research - mccoy-lab/hgv_modules - griffithlab/pVACtools_Intro_Course - griffithlab/CIVIC_SVI_Course - PracticalGenomics/cogaps-on-sciserver - fhdsl/AI_for_software - griffithlab/Immuno_Workflow_Course - -###ADD NEW REPO HERE following the format above# - -### These are custom groups for syncing -- not all files needs to be synced - - files: - - source: config_automation.yml - dest: config_automation.yml - - source: .github/workflows/pull-request.yml - dest: .github/workflows/pull-request.yml - repos: | - jhudsl/OTTR_Template_Website - - - files: - - source: config_automation.yml - dest: config_automation.yml - - source: .github/workflows/delete-preview.yml - dest: .github/workflows/delete-preview.yml - - source: .github/workflows/send-updates.yml - dest: .github/workflows/send-updates.yml - - source: .github/workflows/test-send-updates.yml - dest: .github/workflows/test-send-updates.yml - - source: scripts/make_screenshots.R - dest: scripts/make_screenshots.R - - source: scripts/switch_sync_repo.R - dest: scripts/switch_sync_repo.R - repos: | - jhudsl/OTTR_Quizzes - - - 
files: - - source: .github/workflows/ - dest: .github/workflows/ - exclude: | - starting-course.yml - release-notes.yml - - source: .github/switch_sync_repo.R - dest: .github/switch_sync_repo.R - - source: .github/ISSUE_TEMPLATE/course-problem-report.md - dest: .github/ISSUE_TEMPLATE/course-problem-report.md - - source: .github/ISSUE_TEMPLATE/course-content-add.md - dest: .github/ISSUE_TEMPLATE/course-content-add.md - - source: code_of_conduct.md - dest: code_of_conduct.md - - source: assets/box_images/ - dest: assets/box_images/ - - source: assets/style.css - dest: assets/style.css - - source: assets/toc_close.css - dest: assets/toc_close.css - - source: scripts/ - dest: scripts/ - deleteOrphaned: true - - source: style-sets/fhdasl/ - dest: style-sets/fhdasl/ - repos: | - jhudsl/AnVIL_Template diff --git a/.github/workflows/send-updates.yml b/.github/workflows/send-updates.yml deleted file mode 100644 index 3fda233..0000000 --- a/.github/workflows/send-updates.yml +++ /dev/null @@ -1,45 +0,0 @@ -# Candace Savonen Aug 2021 - -name: Sync Files - -on: - release: - types: - - published - workflow_dispatch: - inputs: - prtag: - description: 'Tag to use?' 
- required: true - default: 'null' - -jobs: - sync: - runs-on: ubuntu-latest - steps: - - name: Checkout Repository - uses: actions/checkout@v3 - - - name: Login as jhudsl-robot - run: | - git config --global --add safe.directory $GITHUB_WORKSPACE - git config --global user.email "itcrtrainingnetwork@gmail.com" - git config --global user.name "jhudsl-robot" - - - name: Get the version - id: get_tag - run: | - if [ github.event.inputs.prtag == 'null' ] - then - echo "version=$(echo $GITHUB_REF | cut -d / -f 3)" >> $GITHUB_OUTPUT - fi - if [ github.event.inputs.prtag != 'null' ] - then - echo "version=${{ github.event.inputs.prtag }}" >> $GITHUB_OUTPUT - fi - - - name: Run Mechanics File Sync - uses: BetaHuhn/repo-file-sync-action@v1.17.21 - with: - GH_PAT: ${{ secrets.GH_PAT }} - COMMIT_BODY: release-${{ steps.get_tag.outputs.version }} diff --git a/.github/workflows/starting-course.yml b/.github/workflows/starting-course.yml deleted file mode 100644 index d1683f2..0000000 --- a/.github/workflows/starting-course.yml +++ /dev/null @@ -1,138 +0,0 @@ -# Code adapted from https://github.com/JetBrains/intellij-platform-plugin-template/blob/deb171483598ee8a5d7621154db880e87b4db4ef/.github/workflows/template-cleanup.yml -# by Candace Savonen for this repository. 
- -name: Starting a new course - -on: - push: - branches: [ main, staging ] - workflow_dispatch: - -jobs: - # Run cleaning process only if workflow is triggered by not being in the Bookdown template anymore - template-cleanup: - name: Template Cleanup - runs-on: ubuntu-latest - if: github.event.repository.name != 'OTTR_Template' - steps: - - name: checkout repo - uses: actions/checkout@v3 - - - name: Login as jhudsl-robot - run: | - git config --global --add safe.directory $GITHUB_WORKSPACE - git config --global user.email "itcrtrainingnetwork@gmail.com" - git config --global user.name "jhudsl-robot" - -##### Delete Template-specific files that aren't needed for new courses - - # Cleanup Template-specific bits - - name: Cleanup - run: | - # Cleanup - rm -rf \ - .github/workflows/report-maker.yml \ - .github/workflows/send-updates.yml \ - .github/workflows/test-send-updates.yml \ - .github/sync.yml \ - .github/test-sync.yml \ - .github/workflows/starting-course.yml \ - .github/ISSUE_TEMPLATE/course-template-problem-report.md \ - .github/ISSUE_TEMPLATE/course-template-feature-request.md \ - resources/code_output \ - resources/screenshots \ - resources/course_screenshots \ - resources/gs_slides \ - resources/image_to_slide_key.tsv \ - resources/images/02-chapter_of_course_files \ - resources/images/03-test_cases_files \ - resources/images/04-figures_files \ - resources/chapt_screen_images \ - Course_Name.rds \ - docs/*.html \ - docs/*.md \ - manuscript/* - - - # Commit modified files - - name: Commit deleted files - id: commit_it - run: | - git config --global --add safe.directory $GITHUB_WORKSPACE - git config --global user.email "itcrtrainingnetwork@gmail.com" - git config --global user.name "jhudsl-robot" - git add . 
- git commit -m "Template cleanup" - pushed_it=true - git push || pushed_it=false - echo "pushed_it=$pushed_it" >> $GITHUB_OUTPUT - - # If main is already protected, then file a PR - - name: Create PR with deleted files - if: steps.commit_it.outputs.pushed_it == 'false' - uses: peter-evans/create-pull-request@v3 - id: pr - with: - commit-message: Delete unnecessary files - signoff: false - branch: auto_copy_rendered_files - delete-branch: true - title: 'Automatic course set up' - body: | - ### Description: - This PR was initiated by the github actions. It helps set up this repository to be ready to write your course. - It deletes some remnant files you don't need for your course but were used when this was a template. - labels: | - automated - reviewers: $GITHUB_ACTOR - token: ${{secrets.GH_PAT}} - -##### Filing issues! - - # Issue for what repository settings need to be set - - name: New Course - Set Repository Settings - uses: peter-evans/create-issue-from-file@v4 - with: - title: New Course - Set Repository Settings - content-filepath: .github/automatic-issues/set-repo-settings.md - labels: automated training issue - - # Issue for what needs to be edited - - name: New Course - Templates to Edit - uses: peter-evans/create-issue-from-file@v4 - with: - title: New Course - Templates to Edit - content-filepath: .github/automatic-issues/templates-to-edit.md - labels: automated training issue - - # Issue for how to enroll repo for updates - - name: New Course - Template Update Enrollment - uses: peter-evans/create-issue-from-file@v4 - with: - title: New Course - Template Update Enrollment - content-filepath: .github/automatic-issues/update-enrollment.md - labels: automated training issue - - # Issue for adding a method of feedback - - name: Reminder - Add a method of user feedback - uses: peter-evans/create-issue-from-file@v4 - with: - title: Reminder - Add user feedback method - content-filepath: .github/automatic-issues/add-feedback-method.md - labels: automated 
training issue - - - name: Get organization name - id: get_org_name - run: | - org_name=$(dirname ${{github.repository}}) - echo "org_name=$org_name" >> $GITHUB_OUTPUT - echo $org_name - - # Issue for adding the course to the jhudsl library - - name: Reminder - Add to jhudsl library - if: ${{ steps.get_org_name.outputs.org_name == 'jhudsl' }} - uses: peter-evans/create-issue-from-file@v4 - with: - title: Reminder - Add to jhudsl library - content-filepath: .github/automatic-issues/add-to-library.md - labels: automated training issue diff --git a/.github/workflows/test-send-updates.yml b/.github/workflows/test-send-updates.yml deleted file mode 100644 index ffb1afc..0000000 --- a/.github/workflows/test-send-updates.yml +++ /dev/null @@ -1,40 +0,0 @@ -# Candace Savonen Aug 2021 - -name: Test Sync Files - -on: - workflow_dispatch: - inputs: - repo: - description: 'What repo to test on e.g. jhudsl/OTTR_Template_Test' - required: true - default: 'jhudsl/OTTR_Template_Test' - -jobs: - test-sync: - runs-on: ubuntu-latest - container: - image: jhudsl/base_ottr:main - - steps: - - name: Checkout Repository - uses: actions/checkout@master - - - name: Login as jhudsl-robot - run: | - git config --global --add safe.directory $GITHUB_WORKSPACE - git config --global user.email "itcrtrainingnetwork@gmail.com" - git config --global user.name "jhudsl-robot" - - - name: Set up test-sync.yml - run: | - cp .github/sync.yml .github/test-sync.yml - # Switch out repo argument - Rscript --vanilla .github/switch_sync_repo.R --repo ${{ github.event.inputs.repo }} - - - name: Run Mechanics File Sync - uses: BetaHuhn/repo-file-sync-action@v1.17.21 - with: - GH_PAT: ${{ secrets.GH_PAT }} - COMMIT_BODY: "test-run" - CONFIG_PATH: .github/test-sync.yml diff --git a/docs/404.html b/docs/404.html deleted file mode 100644 index 190a2b7..0000000 --- a/docs/404.html +++ /dev/null @@ -1,214 +0,0 @@ - - - - - - - Page not found | Immuno Workflow Course – Terra Walkthrough - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - -
-
- - -
-
- -
- - - - - - - - - -
- -
-
-

Page not found

-

The page you requested cannot be found (perhaps it was moved or renamed).

-

You may want to try searching to find the page's new location, or use -the table of contents to find the page you are looking for.

-
-
-
- -
-
- -
-
-
- - -
-
- - - - - - - - - - - - - diff --git a/docs/index.html b/docs/index.html deleted file mode 100644 index b631a6e..0000000 --- a/docs/index.html +++ /dev/null @@ -1,216 +0,0 @@ - - - - - - - Immuno Workflow Course – Terra Walkthrough - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - -
-
- - -
-
- -
- - - - - - - - - -
- -
- -
-

Chapter 1 About

-

This course is an introduction to the immuno workflow. The only current page is the walkthrough for running on Terra.

- -
-
-
- -
-
- -
-
-
- - -
-
- - - - - - - - - - - - - diff --git a/docs/index.md b/docs/index.md deleted file mode 100644 index e3e2e5a..0000000 --- a/docs/index.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: "**Immuno Workflow Course**" -output: - html_document ---- - -# **About** - -This course is an introduction to the [immuno workflow](https://github.com/wustl-oncology/analysis-wdls). The only current page is the [walkthrough for running on Terra](running-a-workflow-in-terra.html). diff --git a/docs/running-a-workflow-in-terra.html b/docs/running-a-workflow-in-terra.html deleted file mode 100644 index 378552c..0000000 --- a/docs/running-a-workflow-in-terra.html +++ /dev/null @@ -1,571 +0,0 @@ - - - - - - - Chapter 2 Running a Workflow in Terra | Immuno Workflow Course – Terra Walkthrough - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - -
-
- - -
-
- -
- - - - - - - - - -
- -
-
-

Chapter 2 Running a Workflow in Terra

-
-

2.1 Prerequisites

-

To complete this walkthrough you’ll need to have access to a billing account in GCP. The cloud-workflows repo has more information on setting this up. Then, if you haven’t used Terra before, you’ll have to create a Terra billing project. Terra has detailed documentation on setting this up.

-
-
-

2.2 Workspace

-

All work in Terra happens in a workspace. Your workspaces are listed on the workspaces page. It also has a button to create a workspace–the plus to the right of the word Workspaces.

-
- -

Workspaces heading with + button

-
-

A unique and memorable name will make it easier to find later. This walkthrough will use “demonstrating-immuno-workflow-in-terra”. After choosing a name, a billing project must be selected.

-
- -

terra workspace form

-
-
-
-

2.3 Adding Data

-

The immuno workflow requires BAM or FASTQ files to be available as inputs. This walkthrough will use the HCC1395 data available here:

- -

These files will need to be extracted and uploaded to Terra. First extract all of these tar files to a sequence_data directory. Then, there are a few ways to upload that directory to Terra:

-
-

2.3.1 Uploading via Terra UI

-

On the “Data” tab for the workspace will be a “Files” link under “Other Data”. From that section there is an “Upload” button that can be used to upload the desired data.

-
- -

terra upload form

-
-
-
-

2.3.2 Uploading via Command Line

-

On the “Dashboard” tab for the workspace will be a link to the Google Cloud Storage Bucket for the workspace. By copying this bucket URL, the files can be uploaded using the gsutil tool.

-
- -

dashboard cloud information

-
-
-
-

2.3.3 A note about sharing data across workspaces.

-

If the same data will be used by many people, it may be beneficial to upload it to a workspace and configure it once. Thereafter that workspace can be cloned to use the data in another workspace without copying it again thereby saving storage costs. The data tables configured below could also be cloned to the new workspace.

-
-
-
-

2.4 Populating Terra Tables

-

For this example, there will be three data tables: “sequence” (to list the information for each pair of input FASTQs), “sample” (for any metadata about the samples), and “analysis” (to group together the sequences and samples for one workflow run). In the simplest case a workflow can be run from a single table (or by supplying an input JSON directly); the additional tables here are to demonstrate some of the linking features of Terra.

-
-

2.4.1 Adding the “sequence” table.

-

On the “Data” tab of the workspace is an “Import Data” button. Clicking on it offers a few options–in this case use the import TSV option.

-
- -

import menu

-
-

This offers the option to upload a file or to copy and paste the TSV. To create the “sequence” table, start with the following:

-
entity:sequence_id  sample_id   fastq1  fastq2  readgroup
-1   HCC1395_TUMOR_DNA   gs://BUCKET/sequence_data/Exome_Tumor/Exome_Tumor_R1.fastq.gz   gs://BUCKET/sequence_data/Exome_Tumor/Exome_Tumor_R2.fastq.gz   @RG\tID:1\tPU:C1TD1ACXX\tSM:HCC1395_TUMOR_DNA\tLB:HCC1395_TUMOR_DNA_LIB1\tPL:Illumina\tCN:WUGSC
-2   HCC1395_NORMAL_DNA  gs://BUCKET/sequence_data/Exome_Norm/Exome_Norm_R1.fastq.gz gs://BUCKET/sequence_data/Exome_Norm/Exome_Norm_R2.fastq.gz @RG\tID:2\tPU:C1TD1ACXX\tSM:HCC1395_NORMAL_DNA\tLB:HCC1395_NORMAL_DNA_LIB1\tPL:Illumina\tCN:WUGSC
-3   HCC1395_TUMOR_RNA   gs://BUCKET/sequence_data/RNAseq_Tumor/RNAseq_Tumor_Lane1_R1.fastq.gz   gs://BUCKET/sequence_data/RNAseq_Tumor/RNAseq_Tumor_Lane1_R2.fastq.gz   "ID:3   PU:H3MYFBBXX.4  SM:HCC1395_TUMOR_RNA    LB:HCC1395_TUMOR_RNA_LIB1   PL:Illumina CN:WUGSC"
-4   HCC1395_TUMOR_RNA   gs://BUCKET/sequence_data/RNAseq_Tumor/RNAseq_Tumor_Lane2_R1.fastq.gz   gs://BUCKET/sequence_data/RNAseq_Tumor/RNAseq_Tumor_Lane2_R2.fastq.gz   "ID:4   PU:H3MYFBBXX.5  SM:HCC1395_TUMOR_RNA    LB:HCC1395_TUMOR_RNA_LIB1   PL:Illumina CN:WUGSC"
-

The bucket paths need to be updated to point to the uploaded files for this workspace by filling in the BUCKET. Notice the headers of this file… the first column starts with entity: to indicate this column identifies the table. Since it says sequence_id the table will be named “sequence”. For this table, the file import must be used as the text uploader does not handle fields that contain tabs.

-
- -

import file

-
-
-
-

2.4.2 Adding the Sample table.

-

This time the text import can be used by copying this block:

-
entity:sample_id    individual_id   type
-HCC1395_TUMOR_DNA   HCC1395 tumor_dna
-HCC1395_NORMAL_DNA  HCC1395 normal_dna
-HCC1395_TUMOR_RNA   HCC1395 tumor_rna
-

This will create a “sample” table.

-
- -

import text

-
-
-
-

2.4.3 Adding the Analysis table.

-

As before, open the TSV import and copy this block:

-
entity:analysis_id  individual_id   tumor_dna_sample    normal_dna_sample   tumor_rna_sample
-HCC1395 HCC1395 {"entityType":"sample","entityName":"HCC1395_NORMAL_DNA"}   {"entityType":"sample","entityName":"HCC1395_TUMOR_DNA"}    {"entityType":"sample","entityName":"HCC1395_TUMOR_RNA"}
-

Notice this table has some entities in the columns. This will automatically link those entries to the “sample” table. This allows access to properties of the “sample” table via this table. If there were another table for individuals, the “sample” table could’ve been created with links to that table for further nested indirect access.

-
-
-

2.4.4 Linking to sequences in the analysis table.

-

Now that the basic tables are in place, there are a few more steps to do before using the workflow. First is to add columns to the analysis table to point to which sequence applies. From the “analysis” table, there is an edit button with an option to “Add Column”. Use it three times, one for each sample.

-
- -

edit analysis table

-
-

Each time, use a type of “reference” to the “sequence” table and select the option that the values are a list.

-

The three columns to add and their values:

- - - - - - - - - - - - - - - - - - - - - -
columnvalue
tumor_dna_sequences1
normal_dna_sequences2
tumor_rna_sequences3,4
-

The option to use a list is selected every time because the workflow expects a list of data, even if some lists only contain one item.

-
- -

add column form

-
-

As a note, although this walkthrough used the Terra UI to demonstrate adding a column to the workflow, it would’ve been possible to create these entities in the TSV directly and skip this step so long as the column is formatted correctly. An example of how to format a list in the TSV:

-
[{"entityType":"sequence","entityName":"1"},{"entityType":"sequence","entityName":"2"}]
-
-
-

2.4.5 Formatting a sequence entry in the sequence table.

-

The Terra interface does not support adding a JSON object directly as a single column unless that object is the output of a workflow. Therefore a separate zero-step WDL workflow is needed in order to format the sequence objects in the way the immuno.wdl expects. This is a great opportunity to see a simple case of setting up a workflow before moving on to the full immuno workflow.

-
-

2.4.5.1 Pulling the sequence formatter workflow from Dockstore

-

First, visit the dockstore page for the workflow: - -* sequence-object-consolidator

-

At the right side of this page is a “Launch with…” section.

-
- -

dockstore launch with section

-
-

Choosing Terra opens a Terra page to select a workspace.

-
- -

terra workflow import

-
-

Once the correct workspace is chosen Terra will open the Workflows tab. This is the main page for configuring the many options for a WDL workflow.

-
-
-

2.4.5.2 Selecting the workflow data.

-

In this case, the “Run workflow(s) with inputs defined by data table” is used to point to the data that was set up. Choose “sequence” for the table to run on.

-
- -

select table

-
-

Next there is a “SELECT DATA” button. In the dialogue, select all four of the sequence entities.

-
- -

select table data

-
-

This will automatically define a sequence set containing these four items. You can give it a custom name if desired or accept the default.

-
-
-

2.4.5.3 Setting the workflow options.

-

The call caching doesn’t matter for this workflow, so the default is fine. There are no intermediate outputs, but it’s a good idea to be in the habit of remembering to select the box to delete them, so go ahead and mark it. The other options can remain unselected–we’re not using reference data, this workflow has no steps, so there is nothing to retry, and we shouldn’t have any empty outputs possible.

-
- -

workflow options

-
-

These options reset every time, so be sure to check that they are configured appropriately before each run. Forgetting to delete intermediate outputs can be expensive over time!

-
-
-

2.4.5.4 Setting the workflow inputs.

-

The inputs to this workflow match the columns in the sequence table, so each input maps to the corresponding column by prepending it with this.:

- - - - - - - - - - - - - - - - - - - - - -
inputvalue
fastq1this.fastq1
fastq2this.fastq2
readgroupthis.readgroup
-
- -

workflow inputs

-
-

The workflow will run once per sequence entity selected above. For each run, this will refer to one row in the table and pull the appropriate value.

-
-
-

2.4.5.5 Setting the workflow outputs.

-

This workflow has only one output, so wire it up like the inputs.

- - - - - - - - - - - - - -
outputvalue
sequence_datathis.sequence_data
-
- -

workflow outputs

-
-

When the workflow runs this will automatically add a column to the sequence table named “sequence_data” with the output. As with the inputs, this happens once per row to fully populate the new column.

-
-
-

2.4.5.6 Launching the workflows.

-

Since four rows were selected from the sequence table, this will launch four workflows–one per row. A dialogue after clicking “RUN ANALYSIS” will confirm this number. And then it will launch!

-
- -

confirm launch

-
-
-
-

2.4.5.7 Seeing the results.

-

These workflows complete quickly so you should soon receive an e-mail with the success or failure status. Assuming they succeeded, returning to the Data tab and selecting the sequence table should show the additional column with the sequence data objects properly formatted for the immuno.wdl workflow.

-
- -

sequence data with new column

-
-
-
-
-
-

2.5 Setting up the immuno.wdl workflow.

-

Now that all the input data is ready, it’s time to do the configure the immuno.wdl workflow.

-
-

2.5.1 Pulling the immuno.wdl workflow from Dockstore

-

First visit the dockstore page for the workflow: - -* immuno.wdl

-

As before, choose the “Launch with” option for “Terra”.

-
-
-

2.5.2 Selecting the workflow data.

-

As before, choose “Run workflow(s) with inputs defined by data table”, but this time choose the “analysis” table. In the “SELECT DATA” dialogue choose the one and only row, for HCC1395.

-
-
-

2.5.3 Setting the workflow options.

-

Call caching can be turned on if desired–should the workflow fail this will allow it to shortcut previously successful steps. Once again it’s a good idea to select the “Delete intermediate outputs” option–though note that call caching won’t work once a run completes and the intermediate outputs have been deleted. The other options can remain unselected.

-
-
-

2.5.4 Setting the workflow inputs.

-
-

2.5.4.1 Static inputs

-

This time there are far more inputs to deal with. Thankfully, most of them are static values. If starting from scratch there is a link to download a JSON template that can be filled in and re-uploaded. An existing JSON from a manual run works, too. For this example, there’s an existing YAML that can be used. It will need to be converted to JSON for use in Terra. There are many ways to do this conversion; one is an online tool like this one. Using the “Upload JSON” we’ll use this to fill in most of the inputs.

-
-
-

2.5.4.2 Linking to the data table

-

Several inputs need link to the data table, so these will need to be filled in to replace the values from the uploaded JSON (if any were set).

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
inputvalue
normal_sample_namethis.normal_dna_sample.sample_id
normal_sequencethis.normal_dna_sequences.sequence_data
rna_sequencethis.tumor_rna_sequences.sequence_data
sample_namethis.tumor_rna_sample.sample_id
tumor_sample_namethis.tumor_dna_sample.sample_id
tumor_sequencethis.tumor_dna_sequences.sequence_data
-

This shows how when a table contains references to another one, you can refer to columns of the other tables through the relationships established.

-
-
-
-

2.5.5 Setting the workflow outputs.

-

Every output from the workflow can be added as a column to the datatable. Choose “Use defaults” to automatically populated all the outputs to columns of the same name.

-
-
-

2.5.6 Launching the workflow

-

Now we’re ready to “Launch Analysis”. This time, we only have one row so it will launch a single workflow to do the work. For this immuno.wdl, it will take a few days to run to completion on the HCC1395, so now we wait. Terra should send an e-mail once the workflow either succeeds or fails.

-
-
-

2.5.7 Viewing the results

-

After the run, there are several ways to examine the results.

-
-

2.5.7.1 In the data table

-

The outputs that were specified will be added as columns to the “analysis” datatable.

-
- -

filled in analysis data table

-
-
-
-

2.5.7.2 From the command-line

-

This table is now very wide! It may be useful to select the row and export it as a TSV to view.

-
- -

export to tsv

-
-

There is also a command-line tool to pull data from a Terra workspace. To pull the analysis table:

-
pip install firecloud
-fissfc entity_tsv -p TERRA_BILLING_PROJECT -w WORKSPACE -t analysis -m flexible
-

For more about this command, see this Terra documentation page.

-
-
-

2.5.7.3 In the Job History

-

For a more comprehensive view of what happened in the workflow, the Job History view can be used.

-
- -

job history listings

-
-

This will have the attempts to run both the immuno and sequence-object-consolidator workflows. Clicking on the immuno workflow will load a details page for digging in.

-
- -

job history details

-
-

The “Job Manager” link on this page will show the inputs, outputs, and timings of each step in the workflow and the “Execution Directory” link will lead to the Google bucket where cromwell did its work. If intermediate outputs were not set to be deleted, they will also be in this location along with the final results.

-

For more on the job manager, see this Terra documentation page.

-
-
-
-
-

2.6 Further information

-

Terra maintains extensive documentation including some video walkthroughs of the basics of using various features of the platform.

- -
-
-
-
- -
-
- -
-
-
- - -
-
- - - - - - - - - - - - - diff --git a/docs/terra_walkthrough.md b/docs/terra_walkthrough.md deleted file mode 100644 index 3e16033..0000000 --- a/docs/terra_walkthrough.md +++ /dev/null @@ -1,296 +0,0 @@ ---- -title: "**Immuno Workflow Course -- Terra Walkthrough**" -output: - html_document: - toc: true ---- - -# Running a Workflow in Terra - -## Prerequisites - -To complete this walkthrough you'll need to have access to a billing account in GCP. The [cloud-workflows repo has more information on setting this up](https://github.com/wustl-oncology/cloud-workflows/blob/main/docs/getting_started_gcp.md). Then, if you haven't used Terra before, you'll have to create a Terra billing project. Terra has [detailed documentation on setting this up](https://support.terra.bio/hc/en-us/articles/360026182251). - -## Workspace - -All work in Terra happens in a workspace. Your workspaces are listed on the [workspaces page](https://app.terra.bio/#workspaces). It also has a button to create a workspace--the plus to the right of the word Workspaces. - -![Workspaces heading with + button](resources/images/terra_screens/workspaces-heading.png) - -A unique and memorable name will make it easier to find later. This walkthrough will use "demonstrating-immuno-workflow-in-terra". After choosing a name, a billing project must be selected. - -![terra workspace form](resources/images/terra_screens/workspace-create_form.png) - -## Adding Data - -The immuno workflow requires BAM or FASTQ files to be available as inputs. This walkthrough will use the HCC1395 data available here: - -* [HCC1395 Normal](http://genomedata.org/pmbio-workshop/fastqs/all/Exome_Norm.tar) -* [HCC1395 Tumor](http://genomedata.org/pmbio-workshop/fastqs/all/Exome_Tumor.tar) -* [HCC1395 Tumor RNA](http://genomedata.org/pmbio-workshop/fastqs/all/RNAseq_Tumor.tar) - -These files will need to be extracted and uploaded to Terra. First extract all of these tar files to a `sequence_data` directory. 
Then, there are a few ways to upload that directory to Terra: - -### Uploading via Terra UI - -On the "Data" tab for the workspace will be a "Files" link under "Other Data". From that section there is an "Upload" button that can be used to upload the desired data. - -![terra upload form](resources/images/terra_screens/data-tab_files.png) - -### Uploading via Command Line - -On the "Dashboard" tab for the workspace will be a link to the Google Cloud Storage Bucket for the workspace. By copying this bucket URL, the files can be uploaded using the `gsutil` tool. - -![dashboard cloud information](resources/images/terra_screens/dashboard-cloud_information.png) - -### A note about sharing data across workspaces. - -If the same data will be used by many people, it may be beneficial to upload it to a workspace and configure it once. Thereafter that workspace can be cloned to use the data in another workspace without copying it again thereby saving storage costs. The data tables configured below could also be cloned to the new workspace. - -## Populating Terra Tables - -For this example, there will be three data tables: "sequence" (to list the information for each pair of input FASTQs), "sample" (for any metadata about the samples), and "analysis" (to group together the sequences and samples for one workflow run). In the simplest case a workflow can be run from a single table (or by supplying an input JSON directly); the additional tables here are to demonstrate some of the linking features of Terra. - -### Adding the "sequence" table. - -On the "Data" tab of the workspace is an "Import Data" button. Clicking on it offers a few options--in this case use the import TSV option. - -![import menu](resources/images/terra_screens/data-upload_tsv.png) - -This offers the option to upload a file or to copy and paste the TSV. 
To create the "sequence" table, start with the following: - -``` -entity:sequence_id sample_id fastq1 fastq2 readgroup -1 HCC1395_TUMOR_DNA gs://BUCKET/sequence_data/Exome_Tumor/Exome_Tumor_R1.fastq.gz gs://BUCKET/sequence_data/Exome_Tumor/Exome_Tumor_R2.fastq.gz @RG\tID:1\tPU:C1TD1ACXX\tSM:HCC1395_TUMOR_DNA\tLB:HCC1395_TUMOR_DNA_LIB1\tPL:Illumina\tCN:WUGSC -2 HCC1395_NORMAL_DNA gs://BUCKET/sequence_data/Exome_Norm/Exome_Norm_R1.fastq.gz gs://BUCKET/sequence_data/Exome_Norm/Exome_Norm_R2.fastq.gz @RG\tID:2\tPU:C1TD1ACXX\tSM:HCC1395_NORMAL_DNA\tLB:HCC1395_NORMAL_DNA_LIB1\tPL:Illumina\tCN:WUGSC -3 HCC1395_TUMOR_RNA gs://BUCKET/sequence_data/RNAseq_Tumor/RNAseq_Tumor_Lane1_R1.fastq.gz gs://BUCKET/sequence_data/RNAseq_Tumor/RNAseq_Tumor_Lane1_R2.fastq.gz "ID:3 PU:H3MYFBBXX.4 SM:HCC1395_TUMOR_RNA LB:HCC1395_TUMOR_RNA_LIB1 PL:Illumina CN:WUGSC" -4 HCC1395_TUMOR_RNA gs://BUCKET/sequence_data/RNAseq_Tumor/RNAseq_Tumor_Lane2_R1.fastq.gz gs://BUCKET/sequence_data/RNAseq_Tumor/RNAseq_Tumor_Lane2_R2.fastq.gz "ID:4 PU:H3MYFBBXX.5 SM:HCC1395_TUMOR_RNA LB:HCC1395_TUMOR_RNA_LIB1 PL:Illumina CN:WUGSC" -``` - -The bucket paths need to be updated to point to the uploaded files for this workspace by filling in the `BUCKET`. Notice the headers of this file... the first column starts with `entity:` to indicate this column identifies the table. Since it says `sequence_id` the table will be named "sequence". For this table, the file import must be used as the text uploader does not handle fields that contain tabs. - -![import file](resources/images/terra_screens/data-import_file.png) - -### Adding the Sample table. - -This time the text import can be used by copying this block: - -``` -entity:sample_id individual_id type -HCC1395_TUMOR_DNA HCC1395 tumor_dna -HCC1395_NORMAL_DNA HCC1395 normal_dna -HCC1395_TUMOR_RNA HCC1395 tumor_rna -``` - -This will create a "sample" table. - -![import text](resources/images/terra_screens/data-import_text.png) - - -### Adding the Analysis table. 
- -As before, open the TSV import and copy this block: - -``` -entity:analysis_id individual_id tumor_dna_sample normal_dna_sample tumor_rna_sample -HCC1395 HCC1395 {"entityType":"sample","entityName":"HCC1395_NORMAL_DNA"} {"entityType":"sample","entityName":"HCC1395_TUMOR_DNA"} {"entityType":"sample","entityName":"HCC1395_TUMOR_RNA"} -``` - -Notice this table has some entities in the columns. This will automatically link those entries to the "sample" table. This allows access to properties of the "sample" table via this table. If there were another table for individuals, the "sample" table could've been created with links to that table for further nested indirect access. - -### Linking to sequences in the analysis table. - -Now that the basic tables are in place, there are a few more steps to do before using the workflow. First is to add columns to the analysis table to point to which sequence applies. From the "analysis" table, there is an edit button with an option to "Add Column". Use it three times, one for each sample. - -![edit analysis table](resources/images/terra_screens/data-edit_analysis_table.png) - -Each time, use a type of "reference" to the "sequence" table and select the option that the values are a list. - -The three columns to add and their values: - -| column | value | -| -------------------- | ----- | -| tumor_dna_sequences | 1 | -| normal_dna_sequences | 2 | -| tumor_rna_sequences | 3,4 | - -The option to use a list is selected every time because the workflow expects a list of data, even if some lists only contain one item. - -![add column form](resources/images/terra_screens/data-add_analysis_column.png) - -As a note, although this walkthrough used the Terra UI to demonstrate adding a column to the workflow, it would've been possible to create these entities in the TSV directly and skip this step so long as the column is formatted correctly. 
An example of how to format a list in the TSV: - -``` -[{"entityType":"sequence","entityName":"1"},{"entityType":"sequence","entityName":"2"}] -``` - -### Formatting a sequence entry in the sequence table. - -The Terra interface does not support adding a JSON object directly as a single column unless that object is the output of a workflow. Therefore a separate zero-step WDL workflow is needed in order to format the sequence objects in the way the `immuno.wdl` expects. This is a great opportunity to see a simple case of setting up a workflow before moving on to the full immuno workflow. - -#### Pulling the sequence formatter workflow from Dockstore - -First, visit the dockstore page for the workflow: - -* [sequence-object-consolidator](https://dockstore.org/workflows/github.com/tmooney/sequence-object-consolidator:master?tab=info) - -At the right side of this page is a "Launch with..." section. - -![dockstore launch with section](resources/images/terra_screens/dockstore-launch_with.png) - - Choosing Terra opens a Terra page to select a workspace. - -![terra workflow import](resources/images/terra_screens/workflow-import_from_dockstore.png) - -Once the correct workspace is chosen Terra will open the Workflows tab. This is the main page for configuring the many options for a WDL workflow. - -#### Selecting the workflow data. - -In this case, the "Run workflow(s) with inputs defined by data table" is used to point to the data that was set up. Choose "sequence" for the table to run on. - -![select table](resources/images/terra_screens/workflows-select_sequence_table.png) - -Next there is a "SELECT DATA" button. In the dialogue, select all four of the sequence entities. - -![select table data](resources/images/terra_screens/workflows-select_sequence_data.png) - -This will automatically define a sequence set containing these four items. You can give it a custom name if desired or accept the default. - -#### Setting the workflow options. 
- -The call caching doesn't matter for this workflow, so the default is fine. There are no intermediate outputs, but it's a good idea to be in the habit of remembering to select the box to delete them, so go ahead and mark it. The other options can remain unselected--we're not using reference data, this workflow has no steps, so there is nothing to retry, and we shouldn't have any empty outputs possible. - -![workflow options](resources/images/terra_screens/workflows-options.png) - -These options reset every time, so be sure to check that they are configured appropriately before each run. Forgetting to delete intermediate outputs can be expensive over time! - -#### Setting the workflow inputs. - -The inputs to this workflow match the columns in the sequence table, so each input maps to the corresponding column by prepending it with `this.`: - -| input | value | -| --------- | -------------- | -| fastq1 | this.fastq1 | -| fastq2 | this.fastq2 | -| readgroup | this.readgroup | - -![workflow inputs](resources/images/terra_screens/workflows-sequence_consolidator_inputs.png) - -The workflow will run once per sequence entity selected above. For each run, `this` will refer to one row in the table and pull the appropriate value. - -#### Setting the workflow outputs. - -This workflow has only one output, so wire it up like the inputs. - -| output | value | -| ------------- | ------------------ | -| sequence_data | this.sequence_data | - -![workflow outputs](resources/images/terra_screens/workflows-sequence_consolidator_outputs.png) - -When the workflow runs this will automatically add a column to the sequence table named "sequence_data" with the output. As with the inputs, this happens once per row to fully populate the new column. - -#### Launching the workflows. - -Since four rows were selected from the sequence table, this will launch four workflows--one per row. A dialogue after clicking "RUN ANALYSIS" will confirm this number. And then it will launch! 
- -![confirm launch](resources/images/terra_screens/workflows-confirm_launch.png) - -#### Seeing the results. - -These workflows complete quickly so you should soon receive an e-mail with the success or failure status. Assuming they succeeded, returning to the Data tab and selecting the sequence table should show the additional column with the sequence data objects properly formatted for the `immuno.wdl` workflow. - -![sequence data with new column](resources/images/terra_screens/data-sequence_with_new_column.png) - -## Setting up the `immuno.wdl` workflow. - -Now that all the input data is ready, it's time to do the configure the `immuno.wdl` workflow. - -### Pulling the `immuno.wdl` workflow from Dockstore - -First visit the dockstore page for the workflow: - -* [immuno.wdl](https://dockstore.org/workflows/github.com/wustl-oncology/analysis-wdls/immuno) - -As before, choose the "Launch with" option for "Terra". - -### Selecting the workflow data. - -As before, choose "Run workflow(s) with inputs defined by data table", but this time choose the "analysis" table. In the "SELECT DATA" dialogue choose the one and only row, for HCC1395. - -### Setting the workflow options. - -Call caching can be turned on if desired--should the workflow fail this will allow it to shortcut previously successful steps. Once again it's a good idea to select the "Delete intermediate outputs" option--though note that call caching won't work once a run completes and the intermediate outputs have been deleted. The other options can remain unselected. - -### Setting the workflow inputs. - -#### Static inputs - -This time there are far more inputs to deal with. Thankfully, most of them are static values. If starting from scratch there is a link to download a JSON template that can be filled in and re-uploaded. An existing JSON from a manual run works, too. 
For this example, there's [an existing YAML](https://github.com/wustl-oncology/immuno_gcp_wdl_compute1/blob/main/example_yamls/human_GRCh38_ens105/hcc1395_immuno_cloud-WDL.yaml) that can be used. It will need to be converted to JSON for use in Terra. There are many ways to do this conversion; one is an online tool like [this one](https://jsonformatter.org/yaml-to-json). Using the "Upload JSON" we'll use this to fill in most of the inputs. - -#### Linking to the data table - -Several inputs need link to the data table, so these will need to be filled in to replace the values from the uploaded JSON (if any were set). - -| input | value | -| ------------------ | --------------------------------------- | -| normal_sample_name | this.normal_dna_sample.sample_id | -| normal_sequence | this.normal_dna_sequences.sequence_data | -| rna_sequence | this.tumor_rna_sequences.sequence_data | -| sample_name | this.tumor_rna_sample.sample_id | -| tumor_sample_name | this.tumor_dna_sample.sample_id | -| tumor_sequence | this.tumor_dna_sequences.sequence_data | - -This shows how when a table contains references to another one, you can refer to columns of the other tables through the relationships established. - -### Setting the workflow outputs. - -Every output from the workflow can be added as a column to the datatable. Choose "Use defaults" to automatically populated all the outputs to columns of the same name. - -### Launching the workflow - -Now we're ready to "Launch Analysis". This time, we only have one row so it will launch a single workflow to do the work. For this immuno.wdl, it will take a few days to run to completion on the HCC1395, so now we wait. Terra should send an e-mail once the workflow either succeeds or fails. - -### Viewing the results - -After the run, there are several ways to examine the results. - -#### In the data table - -The outputs that were specified will be added as columns to the "analysis" datatable. 
- -![filled in analysis data table](resources/images/terra_screens/data-analysis_full.png) - -#### From the command-line - -This table is now very wide! It may be useful to select the row and export it as a TSV to view. - -![export to tsv](resources/images/terra_screens/data-export_to_tsv.png) - -There is also a command-line tool to pull data from a terra workspace. To pull the analysis table: - -``` -pip install firecloud -fissfc entity_tsv -p TERRA_BILLING_PROJECT -w WORKSPACE -t analysis -m flexible -``` - -For more about this command, see [this Terra documentation page](https://support.terra.bio/hc/en-us/articles/360042259232-How-to-Manage-data-with-the-FISS-API). - -#### In the Job History - -For a more comprehensive view of what happened in the workflow, the Job History view can be used. - -![job history listings](resources/images/terra_screens/jobhistory-overview.png) - -This will have the attempts to run both the immuno and sequence-object-consolidator workflows. Clicking on the immuno workflow will load a details page for digging in. - -![job history details](resources/images/terra_screens/jobhistory-details.png) - -The "Job Manager" link on this page will show the inputs, outputs, and timings of each step in the workflow and the "Execution Directory" link will lead to the Google bucket where cromwell did its work. If intermediate outputs were not set to be deleted, they will also be in this location along with the final results. - -For more on the job manager, see [this Terra documentation page](https://support.terra.bio/hc/en-us/articles/360037096272-Job-History-overview-monitoring-workflows-). - -## Further information - -Terra maintains [extensive documentation](https://support.terra.bio/hc/en-us/categories/360001399872-Documentation) including some video walkthroughs of the basics of using various features of the platform. -