From c03a6c17495351950d812c262b5a24c48ebe8982 Mon Sep 17 00:00:00 2001 From: Kyle Ellrott Date: Tue, 26 Jun 2018 18:16:53 -0700 Subject: [PATCH] Improving matrix load code and instructions (#125) * Updating TCGA tutorial * Adding more options for loading matrix into graph * Adding option to load edges without vertices when loading matrix. * Adding more matrix loading instructions * Updating website --- docs/docs/databases/elastic/index.html | 2 +- docs/docs/databases/index.html | 2 +- docs/docs/databases/kvstore/index.html | 2 +- docs/docs/databases/mongo/index.html | 2 +- docs/docs/databases/sql/index.html | 2 +- docs/docs/developers/index.html | 2 +- docs/docs/index.html | 2 +- docs/docs/index.xml | 10 +- docs/docs/queries/getting_started/index.html | 2 +- docs/docs/queries/graphql/index.html | 2 +- docs/docs/queries/index.html | 2 +- docs/docs/queries/jsonpath/index.html | 2 +- docs/docs/queries/operations/index.html | 2 +- docs/docs/security/basic/index.html | 2 +- docs/docs/security/index.html | 2 +- docs/docs/tutorials/amazon/index.html | 2 +- docs/docs/tutorials/index.html | 2 +- docs/docs/tutorials/tcga-rna/index.html | 109 +++++++++++++---- docs/download/index.html | 2 +- docs/index.html | 2 +- docs/index.xml | 10 +- example/load_matrix.py | 122 ++++++++++++++++--- website/content/docs/tutorials/tcga-rna.md | 107 ++++++++++++---- 23 files changed, 301 insertions(+), 93 deletions(-) diff --git a/docs/docs/databases/elastic/index.html b/docs/docs/databases/elastic/index.html index 272c9919..518abd76 100644 --- a/docs/docs/databases/elastic/index.html +++ b/docs/docs/databases/elastic/index.html @@ -3,7 +3,7 @@ - + diff --git a/docs/docs/databases/index.html b/docs/docs/databases/index.html index 85678af4..125132e8 100644 --- a/docs/docs/databases/index.html +++ b/docs/docs/databases/index.html @@ -3,7 +3,7 @@ - + diff --git a/docs/docs/databases/kvstore/index.html b/docs/docs/databases/kvstore/index.html index 07201525..5fc9a014 100644 --- 
a/docs/docs/databases/kvstore/index.html +++ b/docs/docs/databases/kvstore/index.html @@ -3,7 +3,7 @@ - + diff --git a/docs/docs/databases/mongo/index.html b/docs/docs/databases/mongo/index.html index c0bf9860..8fb43a65 100644 --- a/docs/docs/databases/mongo/index.html +++ b/docs/docs/databases/mongo/index.html @@ -3,7 +3,7 @@ - + diff --git a/docs/docs/databases/sql/index.html b/docs/docs/databases/sql/index.html index 5d3e0190..01ec6a54 100644 --- a/docs/docs/databases/sql/index.html +++ b/docs/docs/databases/sql/index.html @@ -3,7 +3,7 @@ - + diff --git a/docs/docs/developers/index.html b/docs/docs/developers/index.html index ddcc0719..ba19972c 100644 --- a/docs/docs/developers/index.html +++ b/docs/docs/developers/index.html @@ -3,7 +3,7 @@ - + diff --git a/docs/docs/index.html b/docs/docs/index.html index 7603e914..022070e4 100644 --- a/docs/docs/index.html +++ b/docs/docs/index.html @@ -3,7 +3,7 @@ - + diff --git a/docs/docs/index.xml b/docs/docs/index.xml index f471fac4..838e9477 100644 --- a/docs/docs/index.xml +++ b/docs/docs/index.xml @@ -173,11 +173,11 @@ Configuration Notes DataSourceName is a driver-specific data source name, usual https://docs.bmeg.io/arachne/docs/tutorials/tcga-rna/ Explore TCGA RNA Expression Data Create the graph -arachne create tcga-rna Load pathway information -curl -O http://www.pathwaycommons.org/archives/PC2/v9/PathwayCommons9.All.hgnc.sif.gz gunzip PathwayCommons9.All.hgnc.sif.gz python $GOPATH/src/github.com/bmeg/arachne/example/load_sif.py --db tcga-rna PathwayCommons9.All.hgnc.sif Load expression data -curl -O https://tcga.xenahubs.net/download/TCGA.BRCA.sampleMap/HiSeqV2.gz gunzip HiSeqV2.gz python $GOPATH/src/github.com/bmeg/arachne/example/load_matrix.py --db tcga-rna HiSeqV2 Load clinical information -curl -O https://tcga.xenahubs.net/download/TCGA.BRCA.sampleMap/BRCA_clinicalMatrix.gz gunzip BRCA_clinicalMatrix.gz python $GOPATH/src/github.com/bmeg/arachne/example/load_property_matrix.py --db tcga-rna 
BRCA_clinicalMatrix Query the graph -pip install "git+https://github.com/bmeg/arachne.git#egg=aql&subdirectory=aql/python/" import aql conn = aql.Connection("http://localhost:8201") O = conn.graph("tcga-rna") # Print out expression data of all Stage IIA samples for row in O. +arachne create tcga-rna Get the data +curl -O http://download.cbioportal.org/gbm_tcga_pub2013.tar.gz tar xvzf gbm_tcga_pub2013.tar.gz Load clinical data +./example/load_matrix.py tcga-rna gbm_tcga_pub2013/data_clinical.txt --row-label 'Donor' Load RNASeq data +./example/load_matrix.py tcga-rna gbm_tcga_pub2013/data_RNA_Seq_v2_expression_median.txt -t --index-col 1 --row-label RNASeq --row-prefix "RNA:" --exclude RNA:Hugo_Symbol Connect RNASeq data to Clinical data +./example/load_matrix.py tcga-rna gbm_tcga_pub2013/data_RNA_Seq_v2_expression_median.txt -t --index-col 1 --no-vertex --edge 'RNA:{_gid}' rna Connect Clinical data to subtypes diff --git a/docs/docs/queries/getting_started/index.html b/docs/docs/queries/getting_started/index.html index 7fe61e83..ddf13446 100644 --- a/docs/docs/queries/getting_started/index.html +++ b/docs/docs/queries/getting_started/index.html @@ -3,7 +3,7 @@ - + diff --git a/docs/docs/queries/graphql/index.html b/docs/docs/queries/graphql/index.html index 23e49a27..8e922eac 100644 --- a/docs/docs/queries/graphql/index.html +++ b/docs/docs/queries/graphql/index.html @@ -3,7 +3,7 @@ - + diff --git a/docs/docs/queries/index.html b/docs/docs/queries/index.html index 4cc57351..51ee5f5d 100644 --- a/docs/docs/queries/index.html +++ b/docs/docs/queries/index.html @@ -3,7 +3,7 @@ - + diff --git a/docs/docs/queries/jsonpath/index.html b/docs/docs/queries/jsonpath/index.html index 8b7bf8a4..57a55097 100644 --- a/docs/docs/queries/jsonpath/index.html +++ b/docs/docs/queries/jsonpath/index.html @@ -3,7 +3,7 @@ - + diff --git a/docs/docs/queries/operations/index.html b/docs/docs/queries/operations/index.html index 5d470daf..55154720 100644 --- 
a/docs/docs/queries/operations/index.html +++ b/docs/docs/queries/operations/index.html @@ -3,7 +3,7 @@ - + diff --git a/docs/docs/security/basic/index.html b/docs/docs/security/basic/index.html index db130cd5..cc129ca0 100644 --- a/docs/docs/security/basic/index.html +++ b/docs/docs/security/basic/index.html @@ -3,7 +3,7 @@ - + diff --git a/docs/docs/security/index.html b/docs/docs/security/index.html index 627dd5f1..a500920d 100644 --- a/docs/docs/security/index.html +++ b/docs/docs/security/index.html @@ -3,7 +3,7 @@ - + diff --git a/docs/docs/tutorials/amazon/index.html b/docs/docs/tutorials/amazon/index.html index 36aa69cb..0c925012 100644 --- a/docs/docs/tutorials/amazon/index.html +++ b/docs/docs/tutorials/amazon/index.html @@ -3,7 +3,7 @@ - + diff --git a/docs/docs/tutorials/index.html b/docs/docs/tutorials/index.html index 87f27fe7..67809bca 100644 --- a/docs/docs/tutorials/index.html +++ b/docs/docs/tutorials/index.html @@ -3,7 +3,7 @@ - + diff --git a/docs/docs/tutorials/tcga-rna/index.html b/docs/docs/tutorials/tcga-rna/index.html index 005f686a..4b8f1298 100644 --- a/docs/docs/tutorials/tcga-rna/index.html +++ b/docs/docs/tutorials/tcga-rna/index.html @@ -3,7 +3,7 @@ - + @@ -230,44 +230,107 @@

Explore TCGA RNA Expression Data

arachne create tcga-rna
 
-

Load pathway information

+

Get the data

-
curl -O http://www.pathwaycommons.org/archives/PC2/v9/PathwayCommons9.All.hgnc.sif.gz
-gunzip PathwayCommons9.All.hgnc.sif.gz
-python $GOPATH/src/github.com/bmeg/arachne/example/load_sif.py --db tcga-rna PathwayCommons9.All.hgnc.sif
+
curl -O http://download.cbioportal.org/gbm_tcga_pub2013.tar.gz
+tar xvzf gbm_tcga_pub2013.tar.gz
 
-

Load expression data

+

Load clinical data

-
curl -O https://tcga.xenahubs.net/download/TCGA.BRCA.sampleMap/HiSeqV2.gz
-gunzip HiSeqV2.gz
-python $GOPATH/src/github.com/bmeg/arachne/example/load_matrix.py --db tcga-rna HiSeqV2
+
./example/load_matrix.py tcga-rna gbm_tcga_pub2013/data_clinical.txt --row-label 'Donor'
 
-

Load clinical information

+

Load RNASeq data

-
curl -O https://tcga.xenahubs.net/download/TCGA.BRCA.sampleMap/BRCA_clinicalMatrix.gz
-gunzip BRCA_clinicalMatrix.gz
-python $GOPATH/src/github.com/bmeg/arachne/example/load_property_matrix.py --db tcga-rna BRCA_clinicalMatrix
+
./example/load_matrix.py tcga-rna gbm_tcga_pub2013/data_RNA_Seq_v2_expression_median.txt -t  --index-col 1 --row-label RNASeq --row-prefix "RNA:" --exclude RNA:Hugo_Symbol
 
-

Query the graph

+

Connect RNASeq data to Clinical data

-
pip install "git+https://github.com/bmeg/arachne.git#egg=aql&subdirectory=aql/python/"
+
./example/load_matrix.py tcga-rna gbm_tcga_pub2013/data_RNA_Seq_v2_expression_median.txt -t  --index-col 1 --no-vertex --edge 'RNA:{_gid}' rna
 
-
import aql
+

Connect Clinical data to subtypes

+ +
./example/load_matrix.py tcga-rna gbm_tcga_pub2013/data_clinical.txt --no-vertex -e "{EXPRESSION_SUBTYPE}" subtype --dst-vertex "{EXPRESSION_SUBTYPE}" Subtype
+
+ +

Load EntrezID to Hugo Symbol mapping

+ +
./example/load_matrix.py tcga-rna gbm_tcga_pub2013/data_RNA_Seq_v2_expression_median.txt --index-col 1 --column-include Hugo_Symbol --row-label Gene
+
+ +

Load Proneural samples into a matrix

+ +
import pandas
+import aql
 
 conn = aql.Connection("http://localhost:8201")
 O = conn.graph("tcga-rna")
+genes = {}
+for k, v in O.query().V().where(aql.eq("_label", "Gene")).render(["_gid", "Hugo_Symbol"]):
+    genes[k] = v
+data = {}
+for row in O.query().V("Proneural").in_().out("rna").render(["_gid", "_data"]):
+    data[row[0]] = row[1]
+samples = pandas.DataFrame(data).rename(genes).transpose().fillna(0.0)
+
+ +

Matrix Load project

+ +
usage: load_matrix.py [-h] [--sep SEP] [--server SERVER]
+                      [--row-label ROW_LABEL] [--row-prefix ROW_PREFIX] [-t]
+                      [--index-col INDEX_COL] [--connect]
+                      [--col-label COL_LABEL] [--col-prefix COL_PREFIX]
+                      [--edge-label EDGE_LABEL] [--edge-prop EDGE_PROP]
+                      [--columns [COLUMNS [COLUMNS ...]]]
+                      [--column-include COLUMN_INCLUDE] [--no-vertex]
+                      [-e EDGE EDGE] [--dst-vertex DST_VERTEX DST_VERTEX]
+                      [-x EXCLUDE] [-d]
+                      db input
+
+positional arguments:
+  db                    Destination Graph
+  input                 Input File
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --sep SEP             TSV delimiter
+  --server SERVER       Server Address
+  --row-label ROW_LABEL
+                        Vertex Label used when loading rows
+  --row-prefix ROW_PREFIX
+                        Prefix added to row vertex gid
+  -t, --transpose       Transpose matrix
+  --index-col INDEX_COL
+                        Column number to use as index (and gid for vertex
+                        load)
+  --connect             Switch to 'fully connected mode' and load matrix cell
+                        values on edges between row and column names
+  --col-label COL_LABEL
+                        Column vertex label in 'connect' mode
+  --col-prefix COL_PREFIX
+                        Prefix added to col vertex gid in 'connect' mode
+  --edge-label EDGE_LABEL
+                        Edge label for edges in 'connect' mode
+  --edge-prop EDGE_PROP
+                        Property name for storing value when in 'connect' mode
+  --columns [COLUMNS [COLUMNS ...]]
+                        Rename columns in TSV
+  --column-include COLUMN_INCLUDE
+                        List subset of columns to use from TSV
+  --no-vertex           Do not load row as vertex
+  -e EDGE EDGE, --edge EDGE EDGE
+                        Create an edge that connects the current row vertex
+                        args: <dst> <edgeType>
+  --dst-vertex DST_VERTEX DST_VERTEX
+                        Create a destination vertex, args: <dstVertex>
+                        <vertexLabel>
+  -x EXCLUDE, --exclude EXCLUDE
+                        Exclude row id
+  -d                    Run in debug mode. Print actions and make no changes
 
-# Print out expression data of all Stage IIA samples
-for row in O.query().\
-    V().\
-    where(aql.and_(aql.eq("_label", "Sample"), aql.eq("pathologic_stage", "Stage IIA"))).\
-    out("has").\
-    where(aql.eq("_label", "Data:Expression"):
-  print row
 
diff --git a/docs/download/index.html b/docs/download/index.html index 579bcbf8..71396359 100644 --- a/docs/download/index.html +++ b/docs/download/index.html @@ -3,7 +3,7 @@ - + diff --git a/docs/index.html b/docs/index.html index 91113a5b..22068faf 100644 --- a/docs/index.html +++ b/docs/index.html @@ -4,7 +4,7 @@ - + diff --git a/docs/index.xml b/docs/index.xml index d5b75d6a..e6a4f5e2 100644 --- a/docs/index.xml +++ b/docs/index.xml @@ -194,11 +194,11 @@ Configuration Notes DataSourceName is a driver-specific data source name, usual https://docs.bmeg.io/arachne/docs/tutorials/tcga-rna/ Explore TCGA RNA Expression Data Create the graph -arachne create tcga-rna Load pathway information -curl -O http://www.pathwaycommons.org/archives/PC2/v9/PathwayCommons9.All.hgnc.sif.gz gunzip PathwayCommons9.All.hgnc.sif.gz python $GOPATH/src/github.com/bmeg/arachne/example/load_sif.py --db tcga-rna PathwayCommons9.All.hgnc.sif Load expression data -curl -O https://tcga.xenahubs.net/download/TCGA.BRCA.sampleMap/HiSeqV2.gz gunzip HiSeqV2.gz python $GOPATH/src/github.com/bmeg/arachne/example/load_matrix.py --db tcga-rna HiSeqV2 Load clinical information -curl -O https://tcga.xenahubs.net/download/TCGA.BRCA.sampleMap/BRCA_clinicalMatrix.gz gunzip BRCA_clinicalMatrix.gz python $GOPATH/src/github.com/bmeg/arachne/example/load_property_matrix.py --db tcga-rna BRCA_clinicalMatrix Query the graph -pip install &quot;git+https://github.com/bmeg/arachne.git#egg=aql&amp;subdirectory=aql/python/&quot; import aql conn = aql.Connection(&quot;http://localhost:8201&quot;) O = conn.graph(&quot;tcga-rna&quot;) # Print out expression data of all Stage IIA samples for row in O. 
+arachne create tcga-rna Get the data +curl -O http://download.cbioportal.org/gbm_tcga_pub2013.tar.gz tar xvzf gbm_tcga_pub2013.tar.gz Load clinical data +./example/load_matrix.py tcga-rna gbm_tcga_pub2013/data_clinical.txt --row-label 'Donor' Load RNASeq data +./example/load_matrix.py tcga-rna gbm_tcga_pub2013/data_RNA_Seq_v2_expression_median.txt -t --index-col 1 --row-label RNASeq --row-prefix &quot;RNA:&quot; --exclude RNA:Hugo_Symbol Connect RNASeq data to Clinical data +./example/load_matrix.py tcga-rna gbm_tcga_pub2013/data_RNA_Seq_v2_expression_median.txt -t --index-col 1 --no-vertex --edge 'RNA:{_gid}' rna Connect Clinical data to subtypes diff --git a/example/load_matrix.py b/example/load_matrix.py index 7f00c121..44863bbc 100755 --- a/example/load_matrix.py +++ b/example/load_matrix.py @@ -3,6 +3,7 @@ from __future__ import print_function import aql +import re import argparse import pandas import math @@ -10,33 +11,116 @@ def load_matrix(args): conn = aql.Connection(args.server) + if args.db not in list(conn.listGraphs()): + conn.addGraph(args.db) O = conn.graph(args.db) - matrix = pandas.read_csv(args.input, sep="\t", index_col=0).transpose() + if args.columns is not None: + matrix = pandas.read_csv(args.input, sep=args.sep, index_col=args.index_col, header=None, names=args.columns, skiprows=args.skiprows) + else: + matrix = pandas.read_csv(args.input, sep=args.sep, index_col=args.index_col, skiprows=args.skiprows) + if args.transpose: + matrix = matrix.transpose() - for c in matrix.columns: - if list(O.query().V(c).count())[0]['count'] == 0: - O.addVertex(c, "Protein") + if args.connect: + if not args.no_vertex: + #every row x col creates an edge with the weight value + for c in matrix.columns: + cname = "%s%s" % (args.col_prefix, c) + if list(O.query().V(c).count())[0]['count'] == 0: + if args.debug: + print("AddVertex %s %s" % (c, args.col_label)) + else: + O.addVertex(c, args.col_label) + for r in matrix.index: + rname = "%s%s" % (args.row_prefix, 
r) + if list(O.query().V(r).count())[0]['count'] == 0: + if args.debug: + print("AddVertex %s %s" % (r, args.row_label)) + else: + O.addVertex(r, args.row_label) + + for name, row in matrix.iterrows(): + rname = "%s%s" % (args.row_prefix, name) + print("Loading: %s" % (rname)) + b = O.bulkAdd() + for c in matrix.columns: + cname = "%s%s" % (args.col_prefix, c) + v = row[c] + if not math.isnan(v): + if args.debug: + print("AddEdge: %s %s %s %s" % (rname,cname,args.edge_label,{args.edge_prop:v}) ) + else: + b.addEdge(rname, cname, args.edge_label, {args.edge_prop:v}) + b.execute() + else: + for name, row in matrix.iterrows(): + rname = "%s%s" % (args.row_prefix, name) + print("Loading: %s" % (rname)) + data = {} + for c in matrix.columns: + v = row[c] + if args.column_include is None or c in args.column_include: + if not isinstance(v,float) or not math.isnan(v): + data[c] = v + if not args.no_vertex and rname not in args.exclude: + if args.debug: + print("Add Vertex %s %s %s" % (rname, args.row_label, data)) + else: + O.addVertex(rname, args.row_label, data) + data["_gid"] = rname + for dst, edge in args.edge: + try: + dstFmt = dst.format(**data) + except KeyError: + dstFmt = None + if dstFmt is not None: + if args.debug: + print("Add Edge %s %s" % (dstFmt, edge)) + else: + O.addEdge(rname, dstFmt, edge) + for dst, label in args.dst_vertex: + try: + dstFmt = dst.format(**data) + except KeyError: + dstFmt = None + if dstFmt is not None: + if list(O.query().V(dstFmt).count())[0]['count'] == 0: + if args.debug: + print("Add Vertex %s %s" % (dstFmt, label)) + else: + O.addVertex(dstFmt, label, {}) - for name, row in matrix.iterrows(): - src = "%s:%s" % (args.data_type, name) - print("Loading: %s" % (src)) - data = {} - for c in matrix.columns: - v = row[c] - if not math.isnan(v): - data[c] = v - O.addVertex(name, "Sample") - O.addVertex(src, "Data:%s" % (args.data_type), data) - O.addEdge(name, src, "has") if __name__ == "__main__": parser = argparse.ArgumentParser() - 
parser.add_argument("input") - parser.add_argument("--server", default="http://localhost:8201") - parser.add_argument("--data-type", dest="data_type", default="Expression") - parser.add_argument("--db", required=True) + parser.add_argument("db", help="Destination Graph") + parser.add_argument("input", help="Input File") + parser.add_argument("--sep", default="\t", help="TSV delimiter") + parser.add_argument("--server", default="http://localhost:8201", help="Server Address") + parser.add_argument("--row-label", dest="row_label", default="Row", help="Vertex Label used when loading rows") + parser.add_argument("--row-prefix", default="", help="Prefix added to row vertex gid") + parser.add_argument("-t", "--transpose", action="store_true", default=False, help="Transpose matrix") + parser.add_argument("--index-col", default=0, type=int, help="Column number to use as index (and gid for vertex load)") + parser.add_argument("--skiprows", default=None, type=int, help="Skip rows at top of file") + parser.add_argument("--connect", action="store_true", default=False, help="Switch to 'fully connected mode' and load matrix cell values on edges between row and column names") + parser.add_argument("--col-label", dest="col_label", default="Col", help="Column vertex label in 'connect' mode") + parser.add_argument("--col-prefix", default="", help="Prefix added to col vertex gid in 'connect' mode") + parser.add_argument("--edge-label", dest="edge_label", default="weight", help="Edge label for edges in 'connect' mode") + parser.add_argument("--edge-prop", dest="edge_prop", default="w", help="Property name for storing value when in 'connect' mode") + + parser.add_argument("--columns", default=None, nargs="*", help="Rename columns in TSV") + parser.add_argument("--column-include", default=None, action="append", help="List subset of columns to use from TSV") + + parser.add_argument("--no-vertex", action="store_true", default=False, help="Do not load row as vertex") + 
parser.add_argument("-e", "--edge", action="append", default=[], nargs=2, help="Create an edge the connected the current row vertex args: ") + parser.add_argument("--dst-vertex", action="append", default=[], nargs=2, help="Create a destination vertex, args: ") + parser.add_argument("-x", "--exclude", action="append", default=[], help="Exclude row id") + + parser.add_argument("-d", dest="debug", action="store_true", default=False, help="Run in debug mode. Print actions and make no changes") args = parser.parse_args() + if args.index_col < 0: + args.index_col = None load_matrix(args) diff --git a/website/content/docs/tutorials/tcga-rna.md b/website/content/docs/tutorials/tcga-rna.md index 2b4feffb..6684ecd1 100644 --- a/website/content/docs/tutorials/tcga-rna.md +++ b/website/content/docs/tutorials/tcga-rna.md @@ -15,47 +15,108 @@ Create the graph arachne create tcga-rna ``` -Load pathway information - +Get the data ``` -curl -O http://www.pathwaycommons.org/archives/PC2/v9/PathwayCommons9.All.hgnc.sif.gz -gunzip PathwayCommons9.All.hgnc.sif.gz -python $GOPATH/src/github.com/bmeg/arachne/example/load_sif.py --db tcga-rna PathwayCommons9.All.hgnc.sif +curl -O http://download.cbioportal.org/gbm_tcga_pub2013.tar.gz +tar xvzf gbm_tcga_pub2013.tar.gz ``` -Load expression data - +Load clinical data ``` -curl -O https://tcga.xenahubs.net/download/TCGA.BRCA.sampleMap/HiSeqV2.gz -gunzip HiSeqV2.gz -python $GOPATH/src/github.com/bmeg/arachne/example/load_matrix.py --db tcga-rna HiSeqV2 +./example/load_matrix.py tcga-rna gbm_tcga_pub2013/data_clinical.txt --row-label 'Donor' ``` -Load clinical information +Load RNASeq data +``` +./example/load_matrix.py tcga-rna gbm_tcga_pub2013/data_RNA_Seq_v2_expression_median.txt -t --index-col 1 --row-label RNASeq --row-prefix "RNA:" --exclude RNA:Hugo_Symbol +``` +Connect RNASeq data to Clinical data ``` -curl -O https://tcga.xenahubs.net/download/TCGA.BRCA.sampleMap/BRCA_clinicalMatrix.gz -gunzip BRCA_clinicalMatrix.gz -python 
$GOPATH/src/github.com/bmeg/arachne/example/load_property_matrix.py --db tcga-rna BRCA_clinicalMatrix +./example/load_matrix.py tcga-rna gbm_tcga_pub2013/data_RNA_Seq_v2_expression_median.txt -t --index-col 1 --no-vertex --edge 'RNA:{_gid}' rna ``` -Query the graph +Connect Clinical data to subtypes +``` +./example/load_matrix.py tcga-rna gbm_tcga_pub2013/data_clinical.txt --no-vertex -e "{EXPRESSION_SUBTYPE}" subtype --dst-vertex "{EXPRESSION_SUBTYPE}" Subtype +``` +Load EntrezID to Hugo Symbol mapping ``` -pip install "git+https://github.com/bmeg/arachne.git#egg=aql&subdirectory=aql/python/" +./example/load_matrix.py tcga-rna gbm_tcga_pub2013/data_RNA_Seq_v2_expression_median.txt --index-col 1 --column-include Hugo_Symbol --row-label Gene ``` + +Load Proneural samples into a matrix ```python +import pandas import aql conn = aql.Connection("http://localhost:8201") O = conn.graph("tcga-rna") +genes = {} +for k, v in O.query().V().where(aql.eq("_label", "Gene")).render(["_gid", "Hugo_Symbol"]): + genes[k] = v +data = {} +for row in O.query().V("Proneural").in_().out("rna").render(["_gid", "_data"]): + data[row[0]] = row[1] +samples = pandas.DataFrame(data).rename(genes).transpose().fillna(0.0) +``` + + +# Matrix Load project + +``` +usage: load_matrix.py [-h] [--sep SEP] [--server SERVER] + [--row-label ROW_LABEL] [--row-prefix ROW_PREFIX] [-t] + [--index-col INDEX_COL] [--connect] + [--col-label COL_LABEL] [--col-prefix COL_PREFIX] + [--edge-label EDGE_LABEL] [--edge-prop EDGE_PROP] + [--columns [COLUMNS [COLUMNS ...]]] + [--column-include COLUMN_INCLUDE] [--no-vertex] + [-e EDGE EDGE] [--dst-vertex DST_VERTEX DST_VERTEX] + [-x EXCLUDE] [-d] + db input + +positional arguments: + db Destination Graph + input Input File + +optional arguments: + -h, --help show this help message and exit + --sep SEP TSV delimiter + --server SERVER Server Address + --row-label ROW_LABEL + Vertex Label used when loading rows + --row-prefix ROW_PREFIX + Prefix added to row vertex gid + 
-t, --transpose Transpose matrix + --index-col INDEX_COL + Column number to use as index (and gid for vertex + load) + --connect Switch to 'fully connected mode' and load matrix cell + values on edges between row and column names + --col-label COL_LABEL + Column vertex label in 'connect' mode + --col-prefix COL_PREFIX + Prefix added to col vertex gid in 'connect' mode + --edge-label EDGE_LABEL + Edge label for edges in 'connect' mode + --edge-prop EDGE_PROP + Property name for storing value when in 'connect' mode + --columns [COLUMNS [COLUMNS ...]] + Rename columns in TSV + --column-include COLUMN_INCLUDE + List subset of columns to use from TSV + --no-vertex Do not load row as vertex + -e EDGE EDGE, --edge EDGE EDGE + Create an edge the connected the current row vertex + args: + --dst-vertex DST_VERTEX DST_VERTEX + Create a destination vertex, args: + + -x EXCLUDE, --exclude EXCLUDE + Exclude row id + -d Run in debug mode. Print actions and make no changes -# Print out expression data of all Stage IIA samples -for row in O.query().\ - V().\ - where(aql.and_(aql.eq("_label", "Sample"), aql.eq("pathologic_stage", "Stage IIA"))).\ - out("has").\ - where(aql.eq("_label", "Data:Expression"): - print row ```