diff --git a/DataFileGenerator.fsx b/DataFileGenerator.fsx index b58fa38..5fbd8b0 100644 --- a/DataFileGenerator.fsx +++ b/DataFileGenerator.fsx @@ -10,7 +10,7 @@ open FluentFTP // ------ Record types used for reading and writing files ------ // Rows for the original GenBank TSV file. -type GenBankRow = { +type FileRow = { assembly_accession : string bioproject : string biosample : string @@ -64,13 +64,42 @@ type SpeciesRow = { species_name : string } +/// Typed representation of an NCBI Database. NCBI contains two main genome databases +/// GenBank and RefSeq. +type DatabaseName = + | GenBank + | RefSeq + + // Returns the base path of the files of each database. Used to remove the + // necessary characters from the URLs in the original assembly list when + // creating the new lists. + member this.GetBasePath() = + match this with + | GenBank -> "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/" + | RefSeq -> "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/" + + // Returns the location of the assembly file on the FTP server for the + // database. Does not include the host path. + member this.GetAssemblyFilePath() = + match this with + | GenBank -> "/genomes/genbank/assembly_summary_genbank.txt" + | RefSeq -> "/genomes/refseq/assembly_summary_refseq.txt" + + // Returns the name of the database as a string. + member this.GetName() = + match this with + | GenBank -> "GenBank" + | RefSeq -> "RefSeq" + + // Returns the filename of the assembly file. + member this.GetFilename() = + match this with + | GenBank -> "assembly_summary_genbank.txt" + | RefSeq -> "assembly_summary_refseq.txt" + // Character array. let characters = Seq.concat [['#']; ['a' .. 'z']] -// Base URL for GenBank files on the FTP server. Used to delete the correct -// number of characters from the FTP path. -let genBankURL = "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/" - // ------ Functions for generating and writing data files ------ // Function for matching the first character of a species name. // Characters that are not letters are treated as a '#'. @@ -81,7 +110,7 @@ let getLookupCharacter (name: string) = // Generate a list of distinct species with a unique species ID number for // each, starting with the specified character. -let getSpeciesList (filteredList : GenBankRow list) (count : int) = +let getSpeciesList (filteredList : FileRow list) (count : int) = // Get a distinct list of species names. // Also sorts it into alphabetical order. let distinctList = List.sort (List.distinct (List.map (fun row -> row.organism_name) filteredList)) @@ -90,12 +119,12 @@ let getSpeciesList (filteredList : GenBankRow list) (count : int) = // Generate a list of assemblies belonging to the species of a specified // character, with the correct ID number for their species. -let getAssemblyList (filteredList : GenBankRow list) (speciesList : SpeciesRow list) = +let getAssemblyList (database : DatabaseName) (filteredList : FileRow list) (speciesList : SpeciesRow list) = // Function for finding a species name match for a certain row. let findNameMatch row = List.tryFind (fun species -> species.species_name.Equals(row.organism_name)) speciesList // Filter the CSV rows by those that have one of the organism names in the // supplied list, and that have a FTP path that isn't "na". - let listWithPaths = List.filter (fun (row : GenBankRow) -> not (row.ftp_path.Equals("na"))) filteredList + let listWithPaths = List.filter (fun (row : FileRow) -> not (row.ftp_path.Equals("na"))) filteredList // Function for sorting a list of AssemblyRows. It should be in the order // of species IDs, and then the accessions if the IDs are the same. let sortAssemblies (assembly1 : AssemblyRow) (assembly2: AssemblyRow) = @@ -103,7 +132,7 @@ let getAssemblyList (filteredList : GenBankRow list) (speciesList : SpeciesRow l | 0 -> assembly1.assembly_accession.CompareTo(assembly2.assembly_accession) | result -> result // Return a (sorted) list of AssemblyRows. - List.sortWith sortAssemblies (List.map (fun row -> { species_id = ((findNameMatch row).Value.species_id) ; assembly_accession = row.assembly_accession ; ftp_path = row.ftp_path.[(String.length genBankURL)..] } ) listWithPaths) + List.sortWith sortAssemblies (List.map (fun row -> { species_id = ((findNameMatch row).Value.species_id) ; assembly_accession = row.assembly_accession ; ftp_path = row.ftp_path.[(String.length (database.GetBasePath()))..] } ) listWithPaths) // Compresses a written text file using GZip compression, writes it to a new // file and deletes the original. @@ -130,7 +159,7 @@ let internal useNCBIConnection (callback) = // file. // - If a file doesn't exist, or is older: return to overwrite existing // file. -// - Otherwise: return to resume existing file (in case it wasn't +// - Otherwise: try to resume existing file (in case it wasn't // downloaded fully before). let isNewerFile (localPath: string) (remotePath: string) (connection: FtpClient) = if (not (File.Exists(localPath))) then @@ -154,65 +183,65 @@ let downloadNCBIFile (localPath: string, remotePath: string) = useNCBIConnection downloadFile -let downloadedFilePath = (Path.Combine(Path.GetTempPath(), "BioProviders_Build", "downloaded_list.txt")) - -// ------ Main operations ------ - -printfn "------------ Starting operations to generate GenBank data file lists for BioProviders. ------------" +// ------ Parsing operations ------ -printfn "------ Downloading GenBank summary file to %s...... ------" downloadedFilePath +// Download the corresponding assembly file from the GenBank FTP server and +// parse it into a set of records. +let getFtpList (database : DatabaseName) = + let downloadedFilePath = (Path.Combine(Path.GetTempPath(), "BioProviders_Build", (database.GetFilename()))) + printfn "Downloading %s summary file to %s..." (database.GetName()) downloadedFilePath -let status = downloadNCBIFile (downloadedFilePath, "/genomes/genbank/assembly_summary_genbank.txt") + let status = downloadNCBIFile (downloadedFilePath, (database.GetAssemblyFilePath())) -match status with - | FtpStatus.Failed -> failwith "------ Failed to download file from NCBI FTP server. ------" - | FtpStatus.Skipped -> printfn "------ File already downloaded. ------" - | _ -> printfn "------ File downloaded successfully. ------" + match status with + | FtpStatus.Failed -> failwith "Failed to download file from NCBI FTP server." + | FtpStatus.Skipped -> printfn "File already downloaded." + | _ -> printfn "File downloaded successfully." -printfn "------ Loading in GenBank assembly summary TSV... ------" + printfn "Loading in %s assembly summary TSV..." (database.GetName()) -// Load in the GenBank file. -(*let reader = new StreamReader("D:\\Users\\Samuel Smith_3\\Documents\\RA\\Downloads\\GenBank FTP\\assembly_summary_genbank_25-09-2023.txt")*) -let reader = new StreamReader(downloadedFilePath) + // Load in the GenBank file. + (*let reader = new StreamReader("D:\\Users\\Samuel Smith_3\\Documents\\RA\\Downloads\\GenBank FTP\\assembly_summary_genbank_25-09-2023.txt")*) + let reader = new StreamReader(downloadedFilePath) -// A function to skip lines that start with ##, to ignore the comment. -let skipFunction (args : ShouldSkipRecordArgs) = - args.Row[0].StartsWith("##") + // A function to skip lines that start with ##, to ignore the comment. + let skipFunction (args : ShouldSkipRecordArgs) = + args.Row[0].StartsWith("##") -// Configuration for the CSV reader. It: -// - Chooses tab as the delimiter; -// - Sets the mode to no escape to ignore quotes; -// - Uses the above function to skip comment lines; and -// - Clear the # symbol on any headers. -let config = new CsvConfiguration(CultureInfo.InvariantCulture) -config.Delimiter <- "\t" -config.Mode <- CsvMode.NoEscape -config.ShouldSkipRecord <- new ShouldSkipRecord(skipFunction) -config.PrepareHeaderForMatch <- fun args -> args.Header.TrimStart('#') + // Configuration for the CSV reader. It: + // - Chooses tab as the delimiter; + // - Sets the mode to no escape to ignore quotes; + // - Uses the above function to skip comment lines; and + // - Clear the # symbol on any headers. + let config = new CsvConfiguration(CultureInfo.InvariantCulture) + config.Delimiter <- "\t" + config.Mode <- CsvMode.NoEscape + config.ShouldSkipRecord <- new ShouldSkipRecord(skipFunction) + config.PrepareHeaderForMatch <- fun args -> args.Header.TrimStart('#') -// Create a CSV reader object and get all records in the loaded file. -let csv = new CsvReader(reader, config) -let records = Seq.toList (csv.GetRecords()) + // Create a CSV reader object and get all records in the loaded file. + let csv = new CsvReader(reader, config) + let records = Seq.toList (csv.GetRecords()) -// Show how many records were loaded. -printfn "Loaded %i records." (List.length records) -printfn "------ TSV loaded successfully. ------" + // Show how many records were loaded. + printfn "%s TSV loaded successfully with a total of %i records." (database.GetName()) (List.length records) + records // Generate a list of species and assembies for the given characater, and write // them to a file. An integer acculmulator is used to ensure unique numerical // IDs for all distinct species. -let generateLists (fullList : GenBankRow list) (acc : int) (character : char) = +let generateLists (database : DatabaseName) (fullList : FileRow list) (acc : int) (character : char) = // Filter the full list of assemblies for only those that have an organism // name matching the current character. let filteredList = List.filter (fun row -> (getLookupCharacter row.organism_name).Equals(character)) fullList // Generate the lists of species and assemblies for the given character. let speciesList = (getSpeciesList filteredList acc) - let assemblyList = (getAssemblyList filteredList speciesList) + let assemblyList = (getAssemblyList database filteredList speciesList) // Generate the filenames for the species and assembly files. - let speciesFilename = $"./build/data/genbank-species-{character}.txt" - let assemblyFilename = $"./build/data/genbank-assemblies-{character}.txt" + let speciesFilename = $"./build/data/{(database.GetName().ToLower())}-species-{character}.txt" + let assemblyFilename = $"./build/data/{(database.GetName().ToLower())}-assemblies-{character}.txt" // Write the species entries to a file. let speciesWriter = new StreamWriter(speciesFilename) @@ -234,6 +263,27 @@ let generateLists (fullList : GenBankRow list) (acc : int) (character : char) = // correct number for the next character. acc + List.length speciesList -printfn "------ Generating new lists from loaded GenBank assembly list... ------" -printfn "------ Successfully generated lists for %i species. ------" (Seq.fold (generateLists records) 0 characters) +// Handles the operations for GenBank. +let generateGenBankLists () = + let database = GenBank + printfn "------ Creating lists for %s ------" (database.GetName()) + let records = getFtpList database + printfn "Generating new lists from loaded %s assembly list..." (database.GetName()) + printfn "Generated lists for %i species." (Seq.fold (generateLists database records) 0 characters) + printfn "------ %s operations successful. ------" (database.GetName()) + +// Handles the operations for RefSeq. +let generateRefSeqLists () = + let database = RefSeq + printfn "------ Creating lists for %s ------" (database.GetName()) + let records = getFtpList database + printfn "Generating new lists from loaded %s assembly list..." (database.GetName()) + printfn "Generated lists for %i species." (Seq.fold (generateLists database records) 0 characters) + printfn "------ %s operations successful. ------" (database.GetName()) + +// ------ Main program ------ + +printfn "------------ Starting operations to generate GenBank and RefSeq data file lists for BioProviders. ------------" +generateGenBankLists() +generateRefSeqLists() printfn "------------ All operations completed. ------------" \ No newline at end of file diff --git a/build/data/genbank-assemblies-#.txt.gz b/build/data/genbank-assemblies-#.txt.gz index f4ddb09..02da8f3 100644 Binary files a/build/data/genbank-assemblies-#.txt.gz and b/build/data/genbank-assemblies-#.txt.gz differ diff --git a/build/data/genbank-assemblies-a.txt.gz b/build/data/genbank-assemblies-a.txt.gz index 43bed95..7c34f59 100644 Binary files a/build/data/genbank-assemblies-a.txt.gz and b/build/data/genbank-assemblies-a.txt.gz differ diff --git a/build/data/genbank-assemblies-b.txt.gz b/build/data/genbank-assemblies-b.txt.gz index 60d56fa..816f645 100644 Binary files a/build/data/genbank-assemblies-b.txt.gz and b/build/data/genbank-assemblies-b.txt.gz differ diff --git a/build/data/genbank-assemblies-c.txt.gz b/build/data/genbank-assemblies-c.txt.gz index 6408aca..f2d4ddb 100644 Binary files a/build/data/genbank-assemblies-c.txt.gz and b/build/data/genbank-assemblies-c.txt.gz differ diff --git a/build/data/genbank-assemblies-d.txt.gz b/build/data/genbank-assemblies-d.txt.gz index 07b890a..924be2d 100644 Binary files a/build/data/genbank-assemblies-d.txt.gz and b/build/data/genbank-assemblies-d.txt.gz differ diff --git a/build/data/genbank-assemblies-e.txt.gz b/build/data/genbank-assemblies-e.txt.gz index 9950a6a..28b9749 100644 Binary files a/build/data/genbank-assemblies-e.txt.gz and b/build/data/genbank-assemblies-e.txt.gz differ diff --git a/build/data/genbank-assemblies-f.txt.gz b/build/data/genbank-assemblies-f.txt.gz index b13795d..3739573 100644 Binary files a/build/data/genbank-assemblies-f.txt.gz and b/build/data/genbank-assemblies-f.txt.gz differ diff --git a/build/data/genbank-assemblies-g.txt.gz b/build/data/genbank-assemblies-g.txt.gz index 08bb222..42f4626 100644 Binary files a/build/data/genbank-assemblies-g.txt.gz and b/build/data/genbank-assemblies-g.txt.gz differ diff --git a/build/data/genbank-assemblies-h.txt.gz b/build/data/genbank-assemblies-h.txt.gz index 65bd5bc..deb2189 100644 Binary files a/build/data/genbank-assemblies-h.txt.gz and b/build/data/genbank-assemblies-h.txt.gz differ diff --git a/build/data/genbank-assemblies-i.txt.gz b/build/data/genbank-assemblies-i.txt.gz index 38c6f4d..576f612 100644 Binary files a/build/data/genbank-assemblies-i.txt.gz and b/build/data/genbank-assemblies-i.txt.gz differ diff --git a/build/data/genbank-assemblies-j.txt.gz b/build/data/genbank-assemblies-j.txt.gz index c960141..d6e65e9 100644 Binary files a/build/data/genbank-assemblies-j.txt.gz and b/build/data/genbank-assemblies-j.txt.gz differ diff --git a/build/data/genbank-assemblies-k.txt.gz b/build/data/genbank-assemblies-k.txt.gz index 15023ea..4539bae 100644 Binary files a/build/data/genbank-assemblies-k.txt.gz and b/build/data/genbank-assemblies-k.txt.gz differ diff --git a/build/data/genbank-assemblies-l.txt.gz b/build/data/genbank-assemblies-l.txt.gz index 150d950..c078b51 100644 Binary files a/build/data/genbank-assemblies-l.txt.gz and b/build/data/genbank-assemblies-l.txt.gz differ diff --git a/build/data/genbank-assemblies-m.txt.gz b/build/data/genbank-assemblies-m.txt.gz index 63b59a6..d3c9aff 100644 Binary files a/build/data/genbank-assemblies-m.txt.gz and b/build/data/genbank-assemblies-m.txt.gz differ diff --git a/build/data/genbank-assemblies-n.txt.gz b/build/data/genbank-assemblies-n.txt.gz index 82a3f7b..4299b7f 100644 Binary files a/build/data/genbank-assemblies-n.txt.gz and b/build/data/genbank-assemblies-n.txt.gz differ diff --git a/build/data/genbank-assemblies-o.txt.gz b/build/data/genbank-assemblies-o.txt.gz index 0f5a00d..1c6fe9d 100644 Binary files a/build/data/genbank-assemblies-o.txt.gz and b/build/data/genbank-assemblies-o.txt.gz differ diff --git a/build/data/genbank-assemblies-p.txt.gz b/build/data/genbank-assemblies-p.txt.gz index 38bdc12..1efdd97 100644 Binary files a/build/data/genbank-assemblies-p.txt.gz and b/build/data/genbank-assemblies-p.txt.gz differ diff --git a/build/data/genbank-assemblies-q.txt.gz b/build/data/genbank-assemblies-q.txt.gz index ec55ede..ffde345 100644 Binary files a/build/data/genbank-assemblies-q.txt.gz and b/build/data/genbank-assemblies-q.txt.gz differ diff --git a/build/data/genbank-assemblies-r.txt.gz b/build/data/genbank-assemblies-r.txt.gz index b1e14a9..5511fa6 100644 Binary files a/build/data/genbank-assemblies-r.txt.gz and b/build/data/genbank-assemblies-r.txt.gz differ diff --git a/build/data/genbank-assemblies-s.txt.gz b/build/data/genbank-assemblies-s.txt.gz index bb2a0d2..853adc2 100644 Binary files a/build/data/genbank-assemblies-s.txt.gz and b/build/data/genbank-assemblies-s.txt.gz differ diff --git a/build/data/genbank-assemblies-t.txt.gz b/build/data/genbank-assemblies-t.txt.gz index 58b1026..1aa5f87 100644 Binary files a/build/data/genbank-assemblies-t.txt.gz and b/build/data/genbank-assemblies-t.txt.gz differ diff --git a/build/data/genbank-assemblies-u.txt.gz b/build/data/genbank-assemblies-u.txt.gz index f1b7927..d43d26d 100644 Binary files a/build/data/genbank-assemblies-u.txt.gz and b/build/data/genbank-assemblies-u.txt.gz differ diff --git a/build/data/genbank-assemblies-v.txt.gz b/build/data/genbank-assemblies-v.txt.gz index 8e92784..d63ec68 100644 Binary files a/build/data/genbank-assemblies-v.txt.gz and b/build/data/genbank-assemblies-v.txt.gz differ diff --git a/build/data/genbank-assemblies-w.txt.gz b/build/data/genbank-assemblies-w.txt.gz index bedec15..dcd724e 100644 Binary files a/build/data/genbank-assemblies-w.txt.gz and b/build/data/genbank-assemblies-w.txt.gz differ diff --git a/build/data/genbank-assemblies-x.txt.gz b/build/data/genbank-assemblies-x.txt.gz index d295ec9..38d9731 100644 Binary files a/build/data/genbank-assemblies-x.txt.gz and b/build/data/genbank-assemblies-x.txt.gz differ diff --git a/build/data/genbank-assemblies-y.txt.gz b/build/data/genbank-assemblies-y.txt.gz index fad3fd9..0cebdfd 100644 Binary files a/build/data/genbank-assemblies-y.txt.gz and b/build/data/genbank-assemblies-y.txt.gz differ diff --git a/build/data/genbank-assemblies-z.txt.gz b/build/data/genbank-assemblies-z.txt.gz index b53e157..5f0e41c 100644 Binary files a/build/data/genbank-assemblies-z.txt.gz and b/build/data/genbank-assemblies-z.txt.gz differ diff --git a/build/data/genbank-species-#.txt.gz b/build/data/genbank-species-#.txt.gz index f053260..ff87fba 100644 Binary files a/build/data/genbank-species-#.txt.gz and b/build/data/genbank-species-#.txt.gz differ diff --git a/build/data/genbank-species-a.txt.gz b/build/data/genbank-species-a.txt.gz index 9eda4cf..30bebac 100644 Binary files a/build/data/genbank-species-a.txt.gz and b/build/data/genbank-species-a.txt.gz differ diff --git a/build/data/genbank-species-b.txt.gz b/build/data/genbank-species-b.txt.gz index 1dc2654..d076947 100644 Binary files a/build/data/genbank-species-b.txt.gz and b/build/data/genbank-species-b.txt.gz differ diff --git a/build/data/genbank-species-c.txt.gz b/build/data/genbank-species-c.txt.gz index 849c055..5b5d38b 100644 Binary files a/build/data/genbank-species-c.txt.gz and b/build/data/genbank-species-c.txt.gz differ diff --git a/build/data/genbank-species-d.txt.gz b/build/data/genbank-species-d.txt.gz index 4cb8b73..32d4c59 100644 Binary files a/build/data/genbank-species-d.txt.gz and b/build/data/genbank-species-d.txt.gz differ diff --git a/build/data/genbank-species-e.txt.gz b/build/data/genbank-species-e.txt.gz index c6653ba..1d917c0 100644 Binary files a/build/data/genbank-species-e.txt.gz and b/build/data/genbank-species-e.txt.gz differ diff --git a/build/data/genbank-species-f.txt.gz b/build/data/genbank-species-f.txt.gz index 143a3d4..865f78e 100644 Binary files a/build/data/genbank-species-f.txt.gz and b/build/data/genbank-species-f.txt.gz differ diff --git a/build/data/genbank-species-g.txt.gz b/build/data/genbank-species-g.txt.gz index d16d458..3742158 100644 Binary files a/build/data/genbank-species-g.txt.gz and b/build/data/genbank-species-g.txt.gz differ diff --git a/build/data/genbank-species-h.txt.gz b/build/data/genbank-species-h.txt.gz index 1c85044..f73240b 100644 Binary files a/build/data/genbank-species-h.txt.gz and b/build/data/genbank-species-h.txt.gz differ diff --git a/build/data/genbank-species-i.txt.gz b/build/data/genbank-species-i.txt.gz index 238fdad..a0aa310 100644 Binary files a/build/data/genbank-species-i.txt.gz and b/build/data/genbank-species-i.txt.gz differ diff --git a/build/data/genbank-species-j.txt.gz b/build/data/genbank-species-j.txt.gz index 04e4127..a6b2c1b 100644 Binary files a/build/data/genbank-species-j.txt.gz and b/build/data/genbank-species-j.txt.gz differ diff --git a/build/data/genbank-species-k.txt.gz b/build/data/genbank-species-k.txt.gz index ec25d90..36aefe4 100644 Binary files a/build/data/genbank-species-k.txt.gz and b/build/data/genbank-species-k.txt.gz differ diff --git a/build/data/genbank-species-l.txt.gz b/build/data/genbank-species-l.txt.gz index 14f2e0f..5bbbb3a 100644 Binary files a/build/data/genbank-species-l.txt.gz and b/build/data/genbank-species-l.txt.gz differ diff --git a/build/data/genbank-species-m.txt.gz b/build/data/genbank-species-m.txt.gz index d4054c6..539e6c6 100644 Binary files a/build/data/genbank-species-m.txt.gz and b/build/data/genbank-species-m.txt.gz differ diff --git a/build/data/genbank-species-n.txt.gz b/build/data/genbank-species-n.txt.gz index d266bb3..cf088aa 100644 Binary files a/build/data/genbank-species-n.txt.gz and b/build/data/genbank-species-n.txt.gz differ diff --git a/build/data/genbank-species-o.txt.gz b/build/data/genbank-species-o.txt.gz index a78dd31..75217cc 100644 Binary files a/build/data/genbank-species-o.txt.gz and b/build/data/genbank-species-o.txt.gz differ diff --git a/build/data/genbank-species-p.txt.gz b/build/data/genbank-species-p.txt.gz index dbeaac8..440df5a 100644 Binary files a/build/data/genbank-species-p.txt.gz and b/build/data/genbank-species-p.txt.gz differ diff --git a/build/data/genbank-species-q.txt.gz b/build/data/genbank-species-q.txt.gz index c4d384c..0a79551 100644 Binary files a/build/data/genbank-species-q.txt.gz and b/build/data/genbank-species-q.txt.gz differ diff --git a/build/data/genbank-species-r.txt.gz b/build/data/genbank-species-r.txt.gz index acb1042..ea46546 100644 Binary files a/build/data/genbank-species-r.txt.gz and b/build/data/genbank-species-r.txt.gz differ diff --git a/build/data/genbank-species-s.txt.gz b/build/data/genbank-species-s.txt.gz index 7a198cc..d03ad0b 100644 Binary files a/build/data/genbank-species-s.txt.gz and b/build/data/genbank-species-s.txt.gz differ diff --git a/build/data/genbank-species-t.txt.gz b/build/data/genbank-species-t.txt.gz index adf71e5..1df0b19 100644 Binary files a/build/data/genbank-species-t.txt.gz and b/build/data/genbank-species-t.txt.gz differ diff --git a/build/data/genbank-species-u.txt.gz b/build/data/genbank-species-u.txt.gz index a70bd3d..2b47b03 100644 Binary files a/build/data/genbank-species-u.txt.gz and b/build/data/genbank-species-u.txt.gz differ diff --git a/build/data/genbank-species-v.txt.gz b/build/data/genbank-species-v.txt.gz index 9f09cba..0850258 100644 Binary files a/build/data/genbank-species-v.txt.gz and b/build/data/genbank-species-v.txt.gz differ diff --git a/build/data/genbank-species-w.txt.gz b/build/data/genbank-species-w.txt.gz index a01afbb..7e36dc7 100644 Binary files a/build/data/genbank-species-w.txt.gz and b/build/data/genbank-species-w.txt.gz differ diff --git a/build/data/genbank-species-x.txt.gz b/build/data/genbank-species-x.txt.gz index c216265..52e6cad 100644 Binary files a/build/data/genbank-species-x.txt.gz and b/build/data/genbank-species-x.txt.gz differ diff --git a/build/data/genbank-species-y.txt.gz b/build/data/genbank-species-y.txt.gz index 692d972..52d578f 100644 Binary files a/build/data/genbank-species-y.txt.gz and b/build/data/genbank-species-y.txt.gz differ diff --git a/build/data/genbank-species-z.txt.gz b/build/data/genbank-species-z.txt.gz index 3d5e134..e02a465 100644 Binary files a/build/data/genbank-species-z.txt.gz and b/build/data/genbank-species-z.txt.gz differ diff --git a/build/data/refseq-assemblies-#.txt.gz b/build/data/refseq-assemblies-#.txt.gz new file mode 100644 index 0000000..615e43d Binary files /dev/null and b/build/data/refseq-assemblies-#.txt.gz differ diff --git a/build/data/refseq-assemblies-a.txt.gz b/build/data/refseq-assemblies-a.txt.gz new file mode 100644 index 0000000..9799b73 Binary files /dev/null and b/build/data/refseq-assemblies-a.txt.gz differ diff --git a/build/data/refseq-assemblies-b.txt.gz b/build/data/refseq-assemblies-b.txt.gz new file mode 100644 index 0000000..dc45bc3 Binary files /dev/null and b/build/data/refseq-assemblies-b.txt.gz differ diff --git a/build/data/refseq-assemblies-c.txt.gz b/build/data/refseq-assemblies-c.txt.gz new file mode 100644 index 0000000..b056953 Binary files /dev/null and b/build/data/refseq-assemblies-c.txt.gz differ diff --git a/build/data/refseq-assemblies-d.txt.gz b/build/data/refseq-assemblies-d.txt.gz new file mode 100644 index 0000000..324bc01 Binary files /dev/null and b/build/data/refseq-assemblies-d.txt.gz differ diff --git a/build/data/refseq-assemblies-e.txt.gz b/build/data/refseq-assemblies-e.txt.gz new file mode 100644 index 0000000..7360596 Binary files /dev/null and b/build/data/refseq-assemblies-e.txt.gz differ diff --git a/build/data/refseq-assemblies-f.txt.gz b/build/data/refseq-assemblies-f.txt.gz new file mode 100644 index 0000000..62fd4b5 Binary files /dev/null and b/build/data/refseq-assemblies-f.txt.gz differ diff --git a/build/data/refseq-assemblies-g.txt.gz b/build/data/refseq-assemblies-g.txt.gz new file mode 100644 index 0000000..09907c0 Binary files /dev/null and b/build/data/refseq-assemblies-g.txt.gz differ diff --git a/build/data/refseq-assemblies-h.txt.gz b/build/data/refseq-assemblies-h.txt.gz new file mode 100644 index 0000000..48246c5 Binary files /dev/null and b/build/data/refseq-assemblies-h.txt.gz differ diff --git a/build/data/refseq-assemblies-i.txt.gz b/build/data/refseq-assemblies-i.txt.gz new file mode 100644 index 0000000..23621ea Binary files /dev/null and b/build/data/refseq-assemblies-i.txt.gz differ diff --git a/build/data/refseq-assemblies-j.txt.gz b/build/data/refseq-assemblies-j.txt.gz new file mode 100644 index 0000000..c0ee925 Binary files /dev/null and b/build/data/refseq-assemblies-j.txt.gz differ diff --git a/build/data/refseq-assemblies-k.txt.gz b/build/data/refseq-assemblies-k.txt.gz new file mode 100644 index 0000000..8234b62 Binary files /dev/null and b/build/data/refseq-assemblies-k.txt.gz differ diff --git a/build/data/refseq-assemblies-l.txt.gz b/build/data/refseq-assemblies-l.txt.gz new file mode 100644 index 0000000..deeca06 Binary files /dev/null and b/build/data/refseq-assemblies-l.txt.gz differ diff --git a/build/data/refseq-assemblies-m.txt.gz b/build/data/refseq-assemblies-m.txt.gz new file mode 100644 index 0000000..c0f4e20 Binary files /dev/null and b/build/data/refseq-assemblies-m.txt.gz differ diff --git a/build/data/refseq-assemblies-n.txt.gz b/build/data/refseq-assemblies-n.txt.gz new file mode 100644 index 0000000..497c901 Binary files /dev/null and b/build/data/refseq-assemblies-n.txt.gz differ diff --git a/build/data/refseq-assemblies-o.txt.gz b/build/data/refseq-assemblies-o.txt.gz new file mode 100644 index 0000000..5e905f0 Binary files /dev/null and b/build/data/refseq-assemblies-o.txt.gz differ diff --git a/build/data/refseq-assemblies-p.txt.gz b/build/data/refseq-assemblies-p.txt.gz new file mode 100644 index 0000000..ea1cd01 Binary files /dev/null and b/build/data/refseq-assemblies-p.txt.gz differ diff --git a/build/data/refseq-assemblies-q.txt.gz b/build/data/refseq-assemblies-q.txt.gz new file mode 100644 index 0000000..4b0e059 Binary files /dev/null and b/build/data/refseq-assemblies-q.txt.gz differ diff --git a/build/data/refseq-assemblies-r.txt.gz b/build/data/refseq-assemblies-r.txt.gz new file mode 100644 index 0000000..229d59b Binary files /dev/null and b/build/data/refseq-assemblies-r.txt.gz differ diff --git a/build/data/refseq-assemblies-s.txt.gz b/build/data/refseq-assemblies-s.txt.gz new file mode 100644 index 0000000..445cc93 Binary files /dev/null and b/build/data/refseq-assemblies-s.txt.gz differ diff --git a/build/data/refseq-assemblies-t.txt.gz b/build/data/refseq-assemblies-t.txt.gz new file mode 100644 index 0000000..4248440 Binary files /dev/null and b/build/data/refseq-assemblies-t.txt.gz differ diff --git a/build/data/refseq-assemblies-u.txt.gz b/build/data/refseq-assemblies-u.txt.gz new file mode 100644 index 0000000..013364b Binary files /dev/null and b/build/data/refseq-assemblies-u.txt.gz differ diff --git a/build/data/refseq-assemblies-v.txt.gz b/build/data/refseq-assemblies-v.txt.gz new file mode 100644 index 0000000..f624d7a Binary files /dev/null and b/build/data/refseq-assemblies-v.txt.gz differ diff --git a/build/data/refseq-assemblies-w.txt.gz b/build/data/refseq-assemblies-w.txt.gz new file mode 100644 index 0000000..3b494e5 Binary files /dev/null and b/build/data/refseq-assemblies-w.txt.gz differ diff --git a/build/data/refseq-assemblies-x.txt.gz b/build/data/refseq-assemblies-x.txt.gz new file mode 100644 index 0000000..262e88d Binary files /dev/null and b/build/data/refseq-assemblies-x.txt.gz differ diff --git a/build/data/refseq-assemblies-y.txt.gz b/build/data/refseq-assemblies-y.txt.gz new file mode 100644 index 0000000..7c2523c Binary files /dev/null and b/build/data/refseq-assemblies-y.txt.gz differ diff --git a/build/data/refseq-assemblies-z.txt.gz b/build/data/refseq-assemblies-z.txt.gz new file mode 100644 index 0000000..9369d47 Binary files /dev/null and b/build/data/refseq-assemblies-z.txt.gz differ diff --git a/build/data/refseq-species-#.txt.gz b/build/data/refseq-species-#.txt.gz new file mode 100644 index 0000000..fad7e72 Binary files /dev/null and b/build/data/refseq-species-#.txt.gz differ diff --git a/build/data/refseq-species-a.txt.gz b/build/data/refseq-species-a.txt.gz new file mode 100644 index 0000000..d33f28d Binary files /dev/null and b/build/data/refseq-species-a.txt.gz differ diff --git a/build/data/refseq-species-b.txt.gz b/build/data/refseq-species-b.txt.gz new file mode 100644 index 0000000..a774bfe Binary files /dev/null and b/build/data/refseq-species-b.txt.gz differ diff --git a/build/data/refseq-species-c.txt.gz b/build/data/refseq-species-c.txt.gz new file mode 100644 index 0000000..1c3a156 Binary files /dev/null and b/build/data/refseq-species-c.txt.gz differ diff --git a/build/data/refseq-species-d.txt.gz b/build/data/refseq-species-d.txt.gz new file mode 100644 index 0000000..900a79d Binary files /dev/null and b/build/data/refseq-species-d.txt.gz differ diff --git a/build/data/refseq-species-e.txt.gz b/build/data/refseq-species-e.txt.gz new file mode 100644 index 0000000..b82168b Binary files /dev/null and b/build/data/refseq-species-e.txt.gz differ diff --git a/build/data/refseq-species-f.txt.gz b/build/data/refseq-species-f.txt.gz new file mode 100644 index 0000000..a8d6fc4 Binary files /dev/null and b/build/data/refseq-species-f.txt.gz differ diff --git a/build/data/refseq-species-g.txt.gz b/build/data/refseq-species-g.txt.gz new file mode 100644 index 0000000..2dae290 Binary files /dev/null and b/build/data/refseq-species-g.txt.gz differ diff --git a/build/data/refseq-species-h.txt.gz b/build/data/refseq-species-h.txt.gz new file mode 100644 index 0000000..f14c490 Binary files /dev/null and b/build/data/refseq-species-h.txt.gz differ diff --git a/build/data/refseq-species-i.txt.gz b/build/data/refseq-species-i.txt.gz new file mode 100644 index 0000000..497caac Binary files /dev/null and b/build/data/refseq-species-i.txt.gz differ diff --git a/build/data/refseq-species-j.txt.gz b/build/data/refseq-species-j.txt.gz new file mode 100644 index 0000000..b892dcf Binary files /dev/null and b/build/data/refseq-species-j.txt.gz differ diff --git a/build/data/refseq-species-k.txt.gz b/build/data/refseq-species-k.txt.gz new file mode 100644 index 0000000..8b151d9 Binary files /dev/null and b/build/data/refseq-species-k.txt.gz differ diff --git a/build/data/refseq-species-l.txt.gz b/build/data/refseq-species-l.txt.gz new file mode 100644 index 0000000..2b45317 Binary files /dev/null and b/build/data/refseq-species-l.txt.gz differ diff --git a/build/data/refseq-species-m.txt.gz b/build/data/refseq-species-m.txt.gz new file mode 100644 index 0000000..40ada42 Binary files /dev/null and b/build/data/refseq-species-m.txt.gz differ diff --git a/build/data/refseq-species-n.txt.gz b/build/data/refseq-species-n.txt.gz new file mode 100644 index 0000000..fce96b7 Binary files /dev/null and b/build/data/refseq-species-n.txt.gz differ diff --git a/build/data/refseq-species-o.txt.gz b/build/data/refseq-species-o.txt.gz new file mode 100644 index 0000000..93addf9 Binary files /dev/null and b/build/data/refseq-species-o.txt.gz differ diff --git a/build/data/refseq-species-p.txt.gz b/build/data/refseq-species-p.txt.gz new file mode 100644 index 0000000..111620f Binary files /dev/null and b/build/data/refseq-species-p.txt.gz differ diff --git a/build/data/refseq-species-q.txt.gz b/build/data/refseq-species-q.txt.gz new file mode 100644 index 0000000..83653ff Binary files /dev/null and b/build/data/refseq-species-q.txt.gz differ diff --git a/build/data/refseq-species-r.txt.gz b/build/data/refseq-species-r.txt.gz new file mode 100644 index 0000000..29d22b5 Binary files /dev/null and b/build/data/refseq-species-r.txt.gz differ diff --git a/build/data/refseq-species-s.txt.gz b/build/data/refseq-species-s.txt.gz new file mode 100644 index 0000000..b980f1b Binary files /dev/null and b/build/data/refseq-species-s.txt.gz differ diff --git a/build/data/refseq-species-t.txt.gz b/build/data/refseq-species-t.txt.gz new file mode 100644 index 0000000..4779ef5 Binary files /dev/null and b/build/data/refseq-species-t.txt.gz differ diff --git a/build/data/refseq-species-u.txt.gz b/build/data/refseq-species-u.txt.gz new file mode 100644 index 0000000..47f1e34 Binary files /dev/null and b/build/data/refseq-species-u.txt.gz differ diff --git a/build/data/refseq-species-v.txt.gz b/build/data/refseq-species-v.txt.gz new file mode 100644 index 0000000..a7d23be Binary files /dev/null and b/build/data/refseq-species-v.txt.gz differ diff --git a/build/data/refseq-species-w.txt.gz b/build/data/refseq-species-w.txt.gz new file mode 100644 index 0000000..a9d38fa Binary files /dev/null and b/build/data/refseq-species-w.txt.gz differ diff --git a/build/data/refseq-species-x.txt.gz b/build/data/refseq-species-x.txt.gz new file mode 100644 index 0000000..49a9b58 Binary files /dev/null and b/build/data/refseq-species-x.txt.gz differ diff --git a/build/data/refseq-species-y.txt.gz b/build/data/refseq-species-y.txt.gz new file mode 100644 index 0000000..3c5cbaa Binary files /dev/null and b/build/data/refseq-species-y.txt.gz differ diff --git a/build/data/refseq-species-z.txt.gz b/build/data/refseq-species-z.txt.gz new file mode 100644 index 0000000..d9b4d7e Binary files /dev/null and b/build/data/refseq-species-z.txt.gz differ diff --git a/src/DesignTime/Common.fs b/src/DesignTime/Common.fs index 80a6442..aea8395 100644 --- a/src/DesignTime/Common.fs +++ b/src/DesignTime/Common.fs @@ -85,6 +85,12 @@ module Context = | GenBank -> "/genomes/all/GCA" | RefSeq -> "/genomes/all/GCF" + // Returns the name of the database as a string. + override this.ToString() = + match this with + | GenBank -> "GenBank" + | RefSeq -> "RefSeq" + // ---------------------------------------------------------------------------------- // Species Types. @@ -222,254 +228,250 @@ type private ICache = module private CacheHelpers = - module General = - - let getCacheFilePath (path: string) = - let cacheLocation = Path.Combine(Path.GetTempPath(), "BioProviders") - let cacheFileName = path.Replace("/", " ").Trim().Replace(" ", "-") - Path.Combine(cacheLocation, cacheFileName) - - let loadFile (path: string) = - if File.Exists(path) then - Some(File.OpenRead(path)) - else - None - - let loadCacheFile (path: string) = - let cachePath = getCacheFilePath (path) - loadFile cachePath - - let saveCacheFile (path: string) = - let cachePath = getCacheFilePath (path) - FTP.downloadNCBIFile (cachePath, path) - - let clearCache () = - let cacheLocation = Path.Combine(Path.GetTempPath(), "BioProviders") - - if Directory.Exists cacheLocation then - let cacheFiles = Directory.GetFiles cacheLocation - Seq.iter (fun file -> File.Delete(file)) cacheFiles - - let clearCacheOld (days: float) = - let cutOffDate = System.DateTime.Now.AddDays(-days) - let cacheLocation = Path.Combine(Path.GetTempPath(), "BioProviders") - - if Directory.Exists cacheLocation then - let cacheFiles = Directory.GetFiles cacheLocation - - Seq.iter - (fun file -> - match File.GetLastAccessTime(file) < cutOffDate with - | true -> File.Delete(file) - | _ -> ()) - cacheFiles - - // Used to download an assembly list from a remote server. - let internal saveAssemblyList (path: string) = - let cachePath = getCacheFilePath (path) - - try - // At the moment, use the base URL for the raw .gz files in the - // BioProviders repository. - let url = - sprintf "https://github.com/fsprojects/BioProviders/raw/main/build/data/%s" path - - let data = Http.Request(url).Body - - match data with - | Binary bytes -> - File.WriteAllBytes(cachePath, bytes) - true - | _ -> - failwith ( - sprintf - "Could not download remote file %s to %s - did not recieve binary content." - path - cachePath - ) - - false - with ex -> + let private getCacheFilePath (path: string) = + let cacheLocation = Path.Combine(Path.GetTempPath(), "BioProviders") + let cacheFileName = path.Replace("/", " ").Trim().Replace(" ", "-") + Path.Combine(cacheLocation, cacheFileName) + + let private loadFile (path: string) = + if File.Exists(path) then + Some(File.OpenRead(path)) + else + None + + let internal loadCacheFile (path: string) = + let cachePath = getCacheFilePath (path) + loadFile cachePath + + let internal saveCacheFile (path: string) = + let cachePath = getCacheFilePath (path) + FTP.downloadNCBIFile (cachePath, path) + + let internal clearCache () = + let cacheLocation = Path.Combine(Path.GetTempPath(), "BioProviders") + + if Directory.Exists cacheLocation then + let cacheFiles = Directory.GetFiles cacheLocation + Seq.iter (fun file -> File.Delete(file)) cacheFiles + + let internal clearCacheOld (days: float) = + let cutOffDate = System.DateTime.Now.AddDays(-days) + let cacheLocation = Path.Combine(Path.GetTempPath(), "BioProviders") + + if Directory.Exists cacheLocation then + let cacheFiles = Directory.GetFiles cacheLocation + + Seq.iter + (fun file -> + match File.GetLastAccessTime(file) < cutOffDate with + | true -> File.Delete(file) + | _ -> ()) + cacheFiles + + // Used to download an assembly list from a remote server. + let private saveAssemblyList (path: string) = + let cachePath = getCacheFilePath (path) + + try + // At the moment, use the base URL for the raw .gz files in the + // BioProviders repository. + let url = + sprintf "https://github.com/fsprojects/BioProviders/raw/main/build/data/%s" path + + let data = Http.Request(url).Body + + match data with + | Binary bytes -> + File.WriteAllBytes(cachePath, bytes) + true + | _ -> failwith ( - sprintf - "Could not download remote file %s to %s, because of the following exception: %s" - path - cachePath - ex.Message + sprintf "Could not download remote file %s to %s - did not recieve binary content." path cachePath ) false + with ex -> + failwith ( + sprintf + "Could not download remote file %s to %s, because of the following exception: %s" + path + cachePath + ex.Message + ) - // Used to load a data file referring to the location of assemblies on - // GenBank's FTP server. If the file does not exist in the cache - // location, attempts to download it from the FTP server (with the - // above function). - let loadAssemblyList (path: string) = - - let fullPath = getCacheFilePath path - - // Read the existing file if the data file has already been - // downloaded. - if File.Exists(fullPath) then - Some(File.OpenRead(fullPath)) - else - match saveAssemblyList path with - | false -> None - | _ -> Some(File.OpenRead(fullPath)) - - module GenBank = - - let private getLookupCharacter (name: string) = - match name.Chars(0) with - | c when System.Char.IsLetter(c) -> c - | _ -> '#' - - let private getSpeciesLookupPath (speciesName: string) = - let character = getLookupCharacter speciesName - $"genbank-species-{character}.txt.gz" - - let private getAssemblyLookupPath (speciesName: string) = - let character = getLookupCharacter speciesName - $"genbank-assemblies-{character}.txt.gz" - - let private getSpeciesID (speciesName: string) = - let speciesLookupFile = getSpeciesLookupPath speciesName - - match General.loadAssemblyList speciesLookupFile with - | None -> invalidOp "Could not load species lookup file." - | Some data -> - data :> Stream - |> (fun stream -> new Compression.GZipStream(stream, Compression.CompressionMode.Decompress)) - |> (fun gzipStream -> - use stream = new StreamReader(gzipStream) - - let rec checkLine () = - if not stream.EndOfStream then - let line = stream.ReadLine() - let info = line.Split(',') - - if info.[1].ToLower() <> speciesName then - checkLine () - else - info.[0] - else - invalidOp "The species could not be found. Check the species name is correct." - - checkLine ()) + false - let parseAssemblyLine (database: DatabaseName) (assemblyLine: string) = - let assemblyInfo = assemblyLine.Split(',') - let accession = assemblyInfo.[1] - let assemblyPath = $"{database.GetPath()}/{assemblyInfo.[2]}" + // Used to load a data file referring to the location of assemblies on + // GenBank's FTP server. If the file does not exist in the cache + // location, attempts to download it from the FTP server (with the + // above function). + let private loadAssemblyList (path: string) = - let assemblyName = - assemblyPath.Split('/') |> (fun parts -> parts.[parts.Length - 1]) + let fullPath = getCacheFilePath path - (accession, assemblyName, assemblyPath) + // Read the existing file if the data file has already been + // downloaded. + if File.Exists(fullPath) then + Some(File.OpenRead(fullPath)) + else + match saveAssemblyList path with + | false -> None + | _ -> Some(File.OpenRead(fullPath)) + + let private getLookupCharacter (name: string) = + match name.Chars(0) with + | c when System.Char.IsLetter(c) -> c + | _ -> '#' + + let private getSpeciesLookupPath (database: DatabaseName) (speciesName: string) = + let character = getLookupCharacter speciesName + $"{(database.ToString()).ToLower()}-species-{character}.txt.gz" + + let private getAssemblyLookupPath (database: DatabaseName) (speciesName: string) = + let character = getLookupCharacter speciesName + $"{(database.ToString()).ToLower()}-assemblies-{character}.txt.gz" + + let private parseAssemblyLine (database: DatabaseName) (assemblyLine: string) = + let assemblyInfo = assemblyLine.Split(',') + let accession = assemblyInfo.[1] + let assemblyPath = $"{database.GetPath()}/{assemblyInfo.[2]}" + + let assemblyName = + assemblyPath.Split('/') |> (fun parts -> parts.[parts.Length - 1]) + + (accession, assemblyName, assemblyPath) + + let private getSpeciesID (database: DatabaseName) (speciesName: string) = + let speciesLookupFile = getSpeciesLookupPath database speciesName + + match loadAssemblyList speciesLookupFile with + | None -> invalidOp $"Could not load {database.ToString()} species lookup file." + | Some data -> + data :> Stream + |> (fun stream -> new Compression.GZipStream(stream, Compression.CompressionMode.Decompress)) + |> (fun gzipStream -> + use stream = new StreamReader(gzipStream) + + let rec checkLine () = + if not stream.EndOfStream then + let line = stream.ReadLine() + let info = line.Split(',') + + if info.[1].ToLower() <> speciesName then + checkLine () + else + info.[0] + else + invalidOp + $"The species could not be found. Check the species name is correct and it is a valid {database.ToString()} species." - let getAssembly (database: DatabaseName) (species: SpeciesName) (accession: AccessionName) = - let speciesID = getSpeciesID (species.ToString()) - let assemblyLookupFile = getAssemblyLookupPath (species.ToString()) + checkLine ()) - match General.loadAssemblyList assemblyLookupFile with - | None -> invalidOp "Could not load assembly lookup file." - | Some data -> - data :> Stream - |> (fun stream -> new Compression.GZipStream(stream, Compression.CompressionMode.Decompress)) - |> (fun gzipStream -> - use stream = new StreamReader(gzipStream) + let getAssembly (database: DatabaseName) (species: SpeciesName) (accession: AccessionName) = + let speciesID = getSpeciesID database (species.ToString()) + let assemblyLookupFile = getAssemblyLookupPath database (species.ToString()) + + match loadAssemblyList assemblyLookupFile with + | None -> invalidOp $"Could not load {database.ToString()} assembly lookup file." + | Some data -> + data :> Stream + |> (fun stream -> new Compression.GZipStream(stream, Compression.CompressionMode.Decompress)) + |> (fun gzipStream -> + use stream = new StreamReader(gzipStream) + + let rec checkLine () = + if not stream.EndOfStream then + let line = stream.ReadLine() + let info = line.Split(',') + + if info.[0] <> speciesID || info.[1].ToLower() <> (accession.ToString()) then + checkLine () + else + parseAssemblyLine database line + else + invalidOp + $"The assembly could not be found. Check that the accession is correct and it is a valid {database.ToString()} accession." - let rec checkLine () = - if not stream.EndOfStream then - let line = stream.ReadLine() - let info = line.Split(',') + checkLine ()) - if info.[0] <> speciesID || info.[1].ToLower() <> (accession.ToString()) then - checkLine () - else - parseAssemblyLine database line - else - invalidOp "The assembly could not be found. Check that the accession is correct." - - checkLine ()) - - let getAssemblies - (database: DatabaseName) - (assemblyLookupPath: string) - (speciesID: string) - (accessionPattern: string) - = - match General.loadAssemblyList assemblyLookupPath with - | None -> invalidOp "Could not load assembly lookup file." - | Some data -> - data :> Stream - |> (fun stream -> new Compression.GZipStream(stream, Compression.CompressionMode.Decompress)) - |> (fun gzipStream -> - use stream = new StreamReader(gzipStream) - - let rec checkLine (assemblies: (string * string * string) list) = - if not (stream.EndOfStream && assemblies.Length = 0) then - let line = stream.ReadLine() - let info = line.Split(',') - - if - info.[0] <> speciesID - || not (Regex.IsMatch(info.[1].ToLower(), accessionPattern)) - then - if assemblies.Length = 0 then - checkLine assemblies - else - assemblies + let getAssemblyCollection + (database: DatabaseName) + (assemblyLookupPath: string) + (speciesID: string) + (accessionPattern: string) + = + match loadAssemblyList assemblyLookupPath with + | None -> invalidOp $"Could not load {database.ToString()} assembly lookup file." + | Some data -> + data :> Stream + |> (fun stream -> new Compression.GZipStream(stream, Compression.CompressionMode.Decompress)) + |> (fun gzipStream -> + use stream = new StreamReader(gzipStream) + + let rec checkLine (assemblies: (string * string * string) list) = + if not (stream.EndOfStream && assemblies.Length = 0) then + let line = stream.ReadLine() + let info = line.Split(',') + + if + info.[0] <> speciesID + || not (Regex.IsMatch(info.[1].ToLower(), accessionPattern)) + then + if assemblies.Length = 0 then + checkLine assemblies else - assemblies @ [ parseAssemblyLine database line ] |> checkLine + assemblies else - invalidOp - "No assemblies matching the accession pattern could be found. Check the accession pattern is correct." + assemblies @ [ parseAssemblyLine database line ] |> checkLine + else + invalidOp + "No assemblies matching the accession pattern could be found. Check the accession pattern is correct." - checkLine []) + checkLine []) - let getSpecies (species: SpeciesName) = - let speciesName = species.ToString() - let speciesID = getSpeciesID (speciesName) - let assemblyLookupFile = getAssemblyLookupPath (speciesName) - - (speciesID, speciesName, assemblyLookupFile) - - let getSpeciesCollection (speciesPattern: string) = - let speciesLookupPath = getSpeciesLookupPath speciesPattern - let assemblyLookupPath = getAssemblyLookupPath speciesPattern - - match General.loadAssemblyList speciesLookupPath with - | None -> invalidOp "Could not load assembly lookup file." - | Some data -> - data :> Stream - |> (fun stream -> new Compression.GZipStream(stream, Compression.CompressionMode.Decompress)) - |> (fun gzipStream -> - use stream = new StreamReader(gzipStream) + let getSpecies (database: DatabaseName) (species: SpeciesName) = + let speciesName = species.ToString() + let speciesID = getSpeciesID database (speciesName) + let assemblyLookupFile = getAssemblyLookupPath database (speciesName) - let rec checkLine (species: (string * string * string) list) = - if not (stream.EndOfStream && species.Length = 0) then - let line = stream.ReadLine() - let info = line.Split(',') + (speciesID, speciesName, assemblyLookupFile) - if not (Regex.IsMatch(info.[1].ToLower(), speciesPattern)) then - if species.Length = 0 then checkLine species else species - else - let speciesID = info.[0] - let speciesName = info.[1] - species @ [ (speciesID, speciesName, assemblyLookupPath) ] |> checkLine + let getSpeciesCollection (database: DatabaseName) (speciesPattern: string) = + let speciesLookupPath = getSpeciesLookupPath database speciesPattern + let assemblyLookupPath = getAssemblyLookupPath database speciesPattern + + match loadAssemblyList speciesLookupPath with + | None -> invalidOp $"Could not load {database.ToString()} assembly lookup file." + | Some data -> + data :> Stream + |> (fun stream -> new Compression.GZipStream(stream, Compression.CompressionMode.Decompress)) + |> (fun gzipStream -> + use stream = new StreamReader(gzipStream) + + let rec checkLine (species: (string * string * string) list) = + if not (stream.EndOfStream && species.Length = 0) then + let line = stream.ReadLine() + let info = line.Split(',') + + if not (Regex.IsMatch(info.[1].ToLower(), speciesPattern)) then + if species.Length = 0 then checkLine species else species else - invalidOp - "No species matching the pattern could be found. Check the species pattern is correct." + let speciesID = info.[0] + let speciesName = info.[1] + species @ [ (speciesID, speciesName, assemblyLookupPath) ] |> checkLine + else + invalidOp + "No species matching the pattern could be found. Check the species pattern is correct." - checkLine []) + checkLine []) // -------------------------------------------------------------------------------------- // Cache Implementation. // -------------------------------------------------------------------------------------- -open CacheHelpers.General +//open CacheHelpers.General +open CacheHelpers type private Cache() = interface ICache with @@ -498,8 +500,8 @@ module CacheAccess = let getAssembly (database: DatabaseName) (species: SpeciesName) (accession: AccessionName) = match database with - | RefSeq _ -> failwith "RefSeq is not currently supported." - | GenBank _ -> CacheHelpers.GenBank.getAssembly database species accession + | RefSeq _ -> CacheHelpers.getAssembly database species accession //CacheHelpers.RefSeq.getAssembly database species accession //failwith "RefSeq is not currently supported." + | GenBank _ -> CacheHelpers.getAssembly database species accession //CacheHelpers.GenBank.getAssembly database species accession let getAssemblies (database: DatabaseName) @@ -508,18 +510,19 @@ module CacheAccess = (accessionPattern: string) = match database with - | RefSeq _ -> failwith "RefSeq is not currently supported." - | GenBank _ -> CacheHelpers.GenBank.getAssemblies database assemblyLookupPath speciesID accessionPattern + | RefSeq _ -> CacheHelpers.getAssemblyCollection database assemblyLookupPath speciesID accessionPattern //CacheHelpers.RefSeq.getAssemblies database assemblyLookupPath speciesID accessionPattern //failwith "RefSeq is not currently supported." + | GenBank _ -> CacheHelpers.getAssemblyCollection database assemblyLookupPath speciesID accessionPattern //CacheHelpers.GenBank.getAssemblies database assemblyLookupPath speciesID accessionPattern let getSpecies (database: DatabaseName) (species: SpeciesName) = match database with - | RefSeq _ -> failwith "RefSeq is not currently supported." - | GenBank _ -> CacheHelpers.GenBank.getSpecies species + | RefSeq _ -> CacheHelpers.getSpecies database species //CacheHelpers.RefSeq.getSpecies species //failwith "RefSeq is not currently supported." + | GenBank _ -> CacheHelpers.getSpecies database species //CacheHelpers.GenBank.getSpecies species let getSpeciesCollection (database: DatabaseName) (speciesPattern: string) = match database, speciesPattern with - | RefSeq _, _ -> failwith "RefSeq is not currently supported." + | RefSeq _, ".*" -> failwith "A species pattern is required." + | RefSeq _, _ -> CacheHelpers.getSpeciesCollection database speciesPattern //CacheHelpers.RefSeq.getSpeciesCollection speciesPattern //failwith "RefSeq is not currently supported." | GenBank _, ".*" -> failwith "A species pattern is required." - | GenBank _, _ -> CacheHelpers.GenBank.getSpeciesCollection speciesPattern + | GenBank _, _ -> CacheHelpers.getSpeciesCollection database speciesPattern //CacheHelpers.GenBank.getSpeciesCollection speciesPattern let deleteOldFiles = (new Cache() :> ICache).PurgeOld(90) diff --git a/src/DesignTime/DesignTime.fs b/src/DesignTime/DesignTime.fs index a5e3363..3338149 100644 --- a/src/DesignTime/DesignTime.fs +++ b/src/DesignTime/DesignTime.fs @@ -52,7 +52,7 @@ type public GenBankProvider(config: TypeProviderConfig) as this = // Add XML documentation to the Type Provider let assemblyHelpText = - """Typed representation of the GenBank FTP server. + """Typed representation of the NCBI FTP server, for GenBank data. The name of the species whose genome is being accessed (e.g. "Staphylococcus borealis"). Defaults to "". The accession of the genome assembly being accessed (e.g. "GCA_003042555.1"). Defaults to "".""" @@ -60,3 +60,57 @@ type public GenBankProvider(config: TypeProviderConfig) as this = // Register the main type with the Type Provider do this.AddNamespace(namespaceName, [ assemblyProvidedType ]) + +// RefSeq Type Provider. +[] +type public RefSeqProvider(config: TypeProviderConfig) as this = + + // Inherit basic Type Provider functionality and type construction. + inherit + TypeProviderForNamespaces( + config, + assemblyReplacementMap = [ ("BioProviders.DesignTime", "BioProviders") ], + addDefaultProbingLocation = true + ) + + // Define structure of the Type Provider + let namespaceName = "BioProviders" + let thisAssembly = Assembly.GetExecutingAssembly() + + let assemblyProvidedType = + ProvidedTypeDefinition(thisAssembly, namespaceName, "RefSeqProvider", Some typeof) + + // Instantiation function for parameterised Assembly Type Provider + let buildAssemblyType (typeName: string) (args: obj[]) = + + // Extract parameters + let species = args.[0] :?> string + let assembly = args.[1] :?> string + + // Define the assembly type + let providedType = + ProvidedTypeDefinition(thisAssembly, namespaceName, typeName, Some typeof) + + // Generate types + (species, assembly) + ||> Context.Parse + ||> Context.Create RefSeq + |> createType providedType + + // Define static parameters for the Type Provider + let assemblyParameters = + [ ProvidedStaticParameter("Species", typeof, parameterDefaultValue = "") + ProvidedStaticParameter("Accession", typeof, parameterDefaultValue = "") ] + + do assemblyProvidedType.DefineStaticParameters(assemblyParameters, buildAssemblyType) + + // Add XML documentation to the Type Provider + let assemblyHelpText = + """Typed representation of the NCBI FTP server, for RefSeq data. + The name of the species whose genome is being accessed (e.g. "Staphylococcus borealis"). Defaults to "". + The accession of the genome assembly being accessed (e.g. "GCF_001224225.1"). Defaults to "".""" + + do assemblyProvidedType.AddXmlDoc(assemblyHelpText) + + // Register the main type with the Type Provider + do this.AddNamespace(namespaceName, [ assemblyProvidedType ]) diff --git a/src/RunTime/RunTime.fsproj b/src/RunTime/RunTime.fsproj index f36d108..9de9802 100644 --- a/src/RunTime/RunTime.fsproj +++ b/src/RunTime/RunTime.fsproj @@ -24,9 +24,6 @@ all - - - <_PackageFiles Include="$(OutputPath)\BioProviders.DesignTime.dll">