diff --git a/.fantomasignore b/.fantomasignore index 9ac8718..c136f52 100644 --- a/.fantomasignore +++ b/.fantomasignore @@ -1,4 +1,2 @@ paket-files/ -tests/ -*.bak -Test.fsx \ No newline at end of file +tests/ \ No newline at end of file diff --git a/.gitignore b/.gitignore index 74c302f..f3a0651 100644 --- a/.gitignore +++ b/.gitignore @@ -360,10 +360,4 @@ output/ tmp/ # FAKE -.fake - -# Code backups -*.bak - -# Test script -Test.fsx \ No newline at end of file +.fake \ No newline at end of file diff --git a/DataFileGenerator.fsx b/DataFileGenerator.fsx new file mode 100644 index 0000000..b58fa38 --- /dev/null +++ b/DataFileGenerator.fsx @@ -0,0 +1,239 @@ +#r "nuget: CsvHelper" +#r "nuget: FluentFTP, 34.0.1" + +open CsvHelper +open System.IO +open CsvHelper.Configuration +open System.Globalization +open System.IO.Compression +open FluentFTP + +// ------ Record types used for reading and writing files ------ +// Rows for the original GenBank TSV file. +type GenBankRow = { + assembly_accession : string + bioproject : string + biosample : string + wgs_master : string + refseq_category : string + taxid : string + species_taxid : string + organism_name : string + infraspecific_name : string + isolate : string + version_status : string + assembly_level : string + release_type : string + genome_rep : string + seq_rel_date : string + asm_name : string + asm_submitter : string + gbrs_paired_asm : string + paired_asm_comp : string + ftp_path : string + excluded_from_refseq : string + relation_to_type_material : string + asm_not_live_date : string + assembly_type : string + group : string + genome_size : string + genome_size_ungapped : string + gc_percent : string + replicon_count : string + scaffold_count : string + contig_count : string + annotation_provider : string + annotation_name : string + annotation_date : string + total_gene_count : string + protein_coding_gene_count : string + non_coding_gene_count : string + pubmed_id : string +} + +// Rows for the generated 
assembly TSV file. +type AssemblyRow = { + species_id : string + assembly_accession : string + ftp_path : string +} + +// Rows for the generated species TSV file. +type SpeciesRow = { + species_id : string + species_name : string +} + +// Lookup characters: '#' followed by 'a' to 'z'. +let characters = Seq.concat [['#']; ['a' .. 'z']] + +// Base URL for GenBank files on the FTP server. Used to delete the correct +// number of characters from the FTP path. +let genBankURL = "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/" + +// ------ Functions for generating and writing data files ------ +// Function for matching the first character of a species name. +// Characters that are not letters are treated as a '#'. +let getLookupCharacter (name: string) = + match name.Chars(0) with + | c when System.Char.IsLetter(c) -> System.Char.ToLower(c) + | _ -> '#' + +// Generate a list of distinct species with a unique species ID number for +// each, starting from the specified count. +let getSpeciesList (filteredList : GenBankRow list) (count : int) = + // Get a distinct list of species names. + // Also sorts it into alphabetical order. + let distinctList = List.sort (List.distinct (List.map (fun row -> row.organism_name) filteredList)) + // Return a list of SpeciesRows. + List.mapi (fun i name -> { species_id = (i + count).ToString() ; species_name = name }) distinctList + +// Generate a list of assemblies belonging to the species of a specified +// character, with the correct ID number for their species. +let getAssemblyList (filteredList : GenBankRow list) (speciesList : SpeciesRow list) = + // Function for finding a species name match for a certain row. + let findNameMatch row = List.tryFind (fun species -> species.species_name.Equals(row.organism_name)) speciesList + // Filter the CSV rows by those that have one of the organism names in the + // supplied list, and that have an FTP path that isn't "na". 
+ let listWithPaths = List.filter (fun (row : GenBankRow) -> not (row.ftp_path.Equals("na"))) filteredList + // Function for sorting a list of AssemblyRows. It should be in the order + // of species IDs, and then the accessions if the IDs are the same. + let sortAssemblies (assembly1 : AssemblyRow) (assembly2: AssemblyRow) = + match assembly1.species_id.CompareTo(assembly2.species_id) with + | 0 -> assembly1.assembly_accession.CompareTo(assembly2.assembly_accession) + | result -> result + // Return a (sorted) list of AssemblyRows. + List.sortWith sortAssemblies (List.map (fun row -> { species_id = ((findNameMatch row).Value.species_id) ; assembly_accession = row.assembly_accession ; ftp_path = row.ftp_path.[(String.length genBankURL)..] } ) listWithPaths) + +// Compresses a written text file using GZip compression, writes it to a new +// file and deletes the original. +let compressFile (filename : string) = + let originalFile = File.OpenRead(filename) + let gZipFile = (new FileInfo(filename + ".gz")).Create() + let gZipStream = new GZipStream(gZipFile, CompressionMode.Compress) + originalFile.CopyTo(gZipStream) + gZipStream.Flush() + gZipStream.Close() + originalFile.Close() + File.Delete(filename) + +// ------ FTP functions ------ + +/// Creates and uses a connection with the NCBI FTP server. +let internal useNCBIConnection (callback) = + let serverBaseLocation = "ftp://ftp.ncbi.nlm.nih.gov" + use client = new FtpClient(serverBaseLocation) + client.Connect() + callback client + +// Checks if a file exists and if so, whether it is older than the remote +// file. +// - If a file doesn't exist, or is older: return to overwrite existing +// file. +// - Otherwise: return to resume existing file (in case it wasn't +// downloaded fully before). 
+let isNewerFile (localPath: string) (remotePath: string) (connection: FtpClient) = + if (not (File.Exists(localPath))) then + FtpLocalExists.Overwrite + else + match File.GetLastWriteTime(localPath) > connection.GetModifiedTime(remotePath) with + | true -> FtpLocalExists.Append + | _ -> FtpLocalExists.Overwrite + +/// Downloads a file from the NCBI FTP server to the local file system. +let downloadNCBIFile (localPath: string, remotePath: string) = + let downloadFile (connection: FtpClient) = + + // Check for changed file as well as verification. + connection.DownloadFile( + localPath, + remotePath, + (isNewerFile localPath remotePath connection), + FtpVerify.Retry + ) + + useNCBIConnection downloadFile + +let downloadedFilePath = (Path.Combine(Path.GetTempPath(), "BioProviders_Build", "downloaded_list.txt")) + +// ------ Main operations ------ + +printfn "------------ Starting operations to generate GenBank data file lists for BioProviders. ------------" + +printfn "------ Downloading GenBank summary file to %s...... ------" downloadedFilePath + +let status = downloadNCBIFile (downloadedFilePath, "/genomes/genbank/assembly_summary_genbank.txt") + +match status with + | FtpStatus.Failed -> failwith "------ Failed to download file from NCBI FTP server. ------" + | FtpStatus.Skipped -> printfn "------ File already downloaded. ------" + | _ -> printfn "------ File downloaded successfully. ------" + +printfn "------ Loading in GenBank assembly summary TSV... ------" + +// Load in the GenBank file. +(*let reader = new StreamReader("D:\\Users\\Samuel Smith_3\\Documents\\RA\\Downloads\\GenBank FTP\\assembly_summary_genbank_25-09-2023.txt")*) +let reader = new StreamReader(downloadedFilePath) + +// A function to skip lines that start with ##, to ignore the comment. +let skipFunction (args : ShouldSkipRecordArgs) = + args.Row[0].StartsWith("##") + +// Configuration for the CSV reader. 
It: +// - Chooses tab as the delimiter; +// - Sets the mode to no escape to ignore quotes; +// - Uses the above function to skip comment lines; and +// - Clears the # symbol on any headers. +let config = new CsvConfiguration(CultureInfo.InvariantCulture) +config.Delimiter <- "\t" +config.Mode <- CsvMode.NoEscape +config.ShouldSkipRecord <- new ShouldSkipRecord(skipFunction) +config.PrepareHeaderForMatch <- fun args -> args.Header.TrimStart('#') + +// Create a CSV reader object and get all records in the loaded file. +let csv = new CsvReader(reader, config) +let records = Seq.toList (csv.GetRecords()) + +// Show how many records were loaded. +printfn "Loaded %i records." (List.length records) +printfn "------ TSV loaded successfully. ------" + +// Generate a list of species and assemblies for the given character, and write +// them to a file. An integer accumulator is used to ensure unique numerical +// IDs for all distinct species. +let generateLists (fullList : GenBankRow list) (acc : int) (character : char) = + // Filter the full list of assemblies for only those that have an organism + // name matching the current character. + let filteredList = List.filter (fun row -> (getLookupCharacter row.organism_name).Equals(character)) fullList + + // Generate the lists of species and assemblies for the given character. + let speciesList = (getSpeciesList filteredList acc) + let assemblyList = (getAssemblyList filteredList speciesList) + + // Generate the filenames for the species and assembly files. + let speciesFilename = $"./build/data/genbank-species-{character}.txt" + let assemblyFilename = $"./build/data/genbank-assemblies-{character}.txt" + + // Write the species entries to a file. 
+ let speciesWriter = new StreamWriter(speciesFilename) + let speciesCsv = new CsvWriter(speciesWriter, CultureInfo.InvariantCulture) + speciesCsv.WriteRecords(speciesList) + speciesWriter.Flush() + speciesWriter.Close() + compressFile(speciesFilename) + + // Write the assembly entries to a file. + let assemblyWriter = new StreamWriter(assemblyFilename) + let assemblyCsv = new CsvWriter(assemblyWriter, CultureInfo.InvariantCulture) + assemblyCsv.WriteRecords(assemblyList) + assemblyWriter.Flush() + assemblyWriter.Close() + compressFile(assemblyFilename) + + // Add the number of new species to the accumulator, to start at the + // correct number for the next character. + acc + List.length speciesList + +printfn "------ Generating new lists from loaded GenBank assembly list... ------" +printfn "------ Successfully generated lists for %i species. ------" (Seq.fold (generateLists records) 0 characters) +printfn "------------ All operations completed. ------------" \ No newline at end of file diff --git a/build/data/genbank-assemblies-#.txt.gz b/build/data/genbank-assemblies-#.txt.gz index c5aa8d2..f4ddb09 100644 Binary files a/build/data/genbank-assemblies-#.txt.gz and b/build/data/genbank-assemblies-#.txt.gz differ diff --git a/build/data/genbank-assemblies-a.txt.gz b/build/data/genbank-assemblies-a.txt.gz index eea28a7..43bed95 100644 Binary files a/build/data/genbank-assemblies-a.txt.gz and b/build/data/genbank-assemblies-a.txt.gz differ diff --git a/build/data/genbank-assemblies-b.txt.gz b/build/data/genbank-assemblies-b.txt.gz index 0fb2466..60d56fa 100644 Binary files a/build/data/genbank-assemblies-b.txt.gz and b/build/data/genbank-assemblies-b.txt.gz differ diff --git a/build/data/genbank-assemblies-c.txt.gz b/build/data/genbank-assemblies-c.txt.gz index c7fb055..6408aca 100644 Binary files a/build/data/genbank-assemblies-c.txt.gz and b/build/data/genbank-assemblies-c.txt.gz differ diff --git a/build/data/genbank-assemblies-d.txt.gz 
b/build/data/genbank-assemblies-d.txt.gz index 52b8115..07b890a 100644 Binary files a/build/data/genbank-assemblies-d.txt.gz and b/build/data/genbank-assemblies-d.txt.gz differ diff --git a/build/data/genbank-assemblies-e.txt.gz b/build/data/genbank-assemblies-e.txt.gz index 39398da..9950a6a 100644 Binary files a/build/data/genbank-assemblies-e.txt.gz and b/build/data/genbank-assemblies-e.txt.gz differ diff --git a/build/data/genbank-assemblies-f.txt.gz b/build/data/genbank-assemblies-f.txt.gz index 7c1b601..b13795d 100644 Binary files a/build/data/genbank-assemblies-f.txt.gz and b/build/data/genbank-assemblies-f.txt.gz differ diff --git a/build/data/genbank-assemblies-g.txt.gz b/build/data/genbank-assemblies-g.txt.gz index c806787..08bb222 100644 Binary files a/build/data/genbank-assemblies-g.txt.gz and b/build/data/genbank-assemblies-g.txt.gz differ diff --git a/build/data/genbank-assemblies-h.txt.gz b/build/data/genbank-assemblies-h.txt.gz index c813e63..65bd5bc 100644 Binary files a/build/data/genbank-assemblies-h.txt.gz and b/build/data/genbank-assemblies-h.txt.gz differ diff --git a/build/data/genbank-assemblies-i.txt.gz b/build/data/genbank-assemblies-i.txt.gz index 84e37a9..38c6f4d 100644 Binary files a/build/data/genbank-assemblies-i.txt.gz and b/build/data/genbank-assemblies-i.txt.gz differ diff --git a/build/data/genbank-assemblies-j.txt.gz b/build/data/genbank-assemblies-j.txt.gz index 4e335f3..c960141 100644 Binary files a/build/data/genbank-assemblies-j.txt.gz and b/build/data/genbank-assemblies-j.txt.gz differ diff --git a/build/data/genbank-assemblies-k.txt.gz b/build/data/genbank-assemblies-k.txt.gz index 5f4a966..15023ea 100644 Binary files a/build/data/genbank-assemblies-k.txt.gz and b/build/data/genbank-assemblies-k.txt.gz differ diff --git a/build/data/genbank-assemblies-l.txt.gz b/build/data/genbank-assemblies-l.txt.gz index c76e011..150d950 100644 Binary files a/build/data/genbank-assemblies-l.txt.gz and 
b/build/data/genbank-assemblies-l.txt.gz differ diff --git a/build/data/genbank-assemblies-m.txt.gz b/build/data/genbank-assemblies-m.txt.gz index 25ab583..63b59a6 100644 Binary files a/build/data/genbank-assemblies-m.txt.gz and b/build/data/genbank-assemblies-m.txt.gz differ diff --git a/build/data/genbank-assemblies-n.txt.gz b/build/data/genbank-assemblies-n.txt.gz index 4f0d4b0..82a3f7b 100644 Binary files a/build/data/genbank-assemblies-n.txt.gz and b/build/data/genbank-assemblies-n.txt.gz differ diff --git a/build/data/genbank-assemblies-o.txt.gz b/build/data/genbank-assemblies-o.txt.gz index 72192dc..0f5a00d 100644 Binary files a/build/data/genbank-assemblies-o.txt.gz and b/build/data/genbank-assemblies-o.txt.gz differ diff --git a/build/data/genbank-assemblies-p.txt.gz b/build/data/genbank-assemblies-p.txt.gz index 6c4ec60..38bdc12 100644 Binary files a/build/data/genbank-assemblies-p.txt.gz and b/build/data/genbank-assemblies-p.txt.gz differ diff --git a/build/data/genbank-assemblies-q.txt.gz b/build/data/genbank-assemblies-q.txt.gz index 024fdd4..ec55ede 100644 Binary files a/build/data/genbank-assemblies-q.txt.gz and b/build/data/genbank-assemblies-q.txt.gz differ diff --git a/build/data/genbank-assemblies-r.txt.gz b/build/data/genbank-assemblies-r.txt.gz index de0affd..b1e14a9 100644 Binary files a/build/data/genbank-assemblies-r.txt.gz and b/build/data/genbank-assemblies-r.txt.gz differ diff --git a/build/data/genbank-assemblies-s.txt.gz b/build/data/genbank-assemblies-s.txt.gz index 8aca9a4..bb2a0d2 100644 Binary files a/build/data/genbank-assemblies-s.txt.gz and b/build/data/genbank-assemblies-s.txt.gz differ diff --git a/build/data/genbank-assemblies-t.txt.gz b/build/data/genbank-assemblies-t.txt.gz index 4093ea1..58b1026 100644 Binary files a/build/data/genbank-assemblies-t.txt.gz and b/build/data/genbank-assemblies-t.txt.gz differ diff --git a/build/data/genbank-assemblies-u.txt.gz b/build/data/genbank-assemblies-u.txt.gz index c8d3b9b..f1b7927 
100644 Binary files a/build/data/genbank-assemblies-u.txt.gz and b/build/data/genbank-assemblies-u.txt.gz differ diff --git a/build/data/genbank-assemblies-v.txt.gz b/build/data/genbank-assemblies-v.txt.gz index 3ce0829..8e92784 100644 Binary files a/build/data/genbank-assemblies-v.txt.gz and b/build/data/genbank-assemblies-v.txt.gz differ diff --git a/build/data/genbank-assemblies-w.txt.gz b/build/data/genbank-assemblies-w.txt.gz index 30cba9c..bedec15 100644 Binary files a/build/data/genbank-assemblies-w.txt.gz and b/build/data/genbank-assemblies-w.txt.gz differ diff --git a/build/data/genbank-assemblies-x.txt.gz b/build/data/genbank-assemblies-x.txt.gz index 8a7964a..d295ec9 100644 Binary files a/build/data/genbank-assemblies-x.txt.gz and b/build/data/genbank-assemblies-x.txt.gz differ diff --git a/build/data/genbank-assemblies-y.txt.gz b/build/data/genbank-assemblies-y.txt.gz index 8284328..fad3fd9 100644 Binary files a/build/data/genbank-assemblies-y.txt.gz and b/build/data/genbank-assemblies-y.txt.gz differ diff --git a/build/data/genbank-assemblies-z.txt.gz b/build/data/genbank-assemblies-z.txt.gz index 86e87cd..b53e157 100644 Binary files a/build/data/genbank-assemblies-z.txt.gz and b/build/data/genbank-assemblies-z.txt.gz differ diff --git a/build/data/genbank-species-#.txt.gz b/build/data/genbank-species-#.txt.gz index 18e9048..f053260 100644 Binary files a/build/data/genbank-species-#.txt.gz and b/build/data/genbank-species-#.txt.gz differ diff --git a/build/data/genbank-species-a.txt.gz b/build/data/genbank-species-a.txt.gz index 6dd6a96..9eda4cf 100644 Binary files a/build/data/genbank-species-a.txt.gz and b/build/data/genbank-species-a.txt.gz differ diff --git a/build/data/genbank-species-b.txt.gz b/build/data/genbank-species-b.txt.gz index 1211dd8..1dc2654 100644 Binary files a/build/data/genbank-species-b.txt.gz and b/build/data/genbank-species-b.txt.gz differ diff --git a/build/data/genbank-species-c.txt.gz b/build/data/genbank-species-c.txt.gz 
index 7ac398c..849c055 100644 Binary files a/build/data/genbank-species-c.txt.gz and b/build/data/genbank-species-c.txt.gz differ diff --git a/build/data/genbank-species-d.txt.gz b/build/data/genbank-species-d.txt.gz index 92b7499..4cb8b73 100644 Binary files a/build/data/genbank-species-d.txt.gz and b/build/data/genbank-species-d.txt.gz differ diff --git a/build/data/genbank-species-e.txt.gz b/build/data/genbank-species-e.txt.gz index 3d19b21..c6653ba 100644 Binary files a/build/data/genbank-species-e.txt.gz and b/build/data/genbank-species-e.txt.gz differ diff --git a/build/data/genbank-species-f.txt.gz b/build/data/genbank-species-f.txt.gz index b9f01a2..143a3d4 100644 Binary files a/build/data/genbank-species-f.txt.gz and b/build/data/genbank-species-f.txt.gz differ diff --git a/build/data/genbank-species-g.txt.gz b/build/data/genbank-species-g.txt.gz index 8fe4791..d16d458 100644 Binary files a/build/data/genbank-species-g.txt.gz and b/build/data/genbank-species-g.txt.gz differ diff --git a/build/data/genbank-species-h.txt.gz b/build/data/genbank-species-h.txt.gz index 04b55b8..1c85044 100644 Binary files a/build/data/genbank-species-h.txt.gz and b/build/data/genbank-species-h.txt.gz differ diff --git a/build/data/genbank-species-i.txt.gz b/build/data/genbank-species-i.txt.gz index b752ea9..238fdad 100644 Binary files a/build/data/genbank-species-i.txt.gz and b/build/data/genbank-species-i.txt.gz differ diff --git a/build/data/genbank-species-j.txt.gz b/build/data/genbank-species-j.txt.gz index 4ad3f31..04e4127 100644 Binary files a/build/data/genbank-species-j.txt.gz and b/build/data/genbank-species-j.txt.gz differ diff --git a/build/data/genbank-species-k.txt.gz b/build/data/genbank-species-k.txt.gz index 64bd986..ec25d90 100644 Binary files a/build/data/genbank-species-k.txt.gz and b/build/data/genbank-species-k.txt.gz differ diff --git a/build/data/genbank-species-l.txt.gz b/build/data/genbank-species-l.txt.gz index db51ff0..14f2e0f 100644 Binary files 
a/build/data/genbank-species-l.txt.gz and b/build/data/genbank-species-l.txt.gz differ diff --git a/build/data/genbank-species-m.txt.gz b/build/data/genbank-species-m.txt.gz index 67e8586..d4054c6 100644 Binary files a/build/data/genbank-species-m.txt.gz and b/build/data/genbank-species-m.txt.gz differ diff --git a/build/data/genbank-species-n.txt.gz b/build/data/genbank-species-n.txt.gz index 4c4a657..d266bb3 100644 Binary files a/build/data/genbank-species-n.txt.gz and b/build/data/genbank-species-n.txt.gz differ diff --git a/build/data/genbank-species-o.txt.gz b/build/data/genbank-species-o.txt.gz index 8574180..a78dd31 100644 Binary files a/build/data/genbank-species-o.txt.gz and b/build/data/genbank-species-o.txt.gz differ diff --git a/build/data/genbank-species-p.txt.gz b/build/data/genbank-species-p.txt.gz index acb7476..dbeaac8 100644 Binary files a/build/data/genbank-species-p.txt.gz and b/build/data/genbank-species-p.txt.gz differ diff --git a/build/data/genbank-species-q.txt.gz b/build/data/genbank-species-q.txt.gz index 60ce231..c4d384c 100644 Binary files a/build/data/genbank-species-q.txt.gz and b/build/data/genbank-species-q.txt.gz differ diff --git a/build/data/genbank-species-r.txt.gz b/build/data/genbank-species-r.txt.gz index b631b05..acb1042 100644 Binary files a/build/data/genbank-species-r.txt.gz and b/build/data/genbank-species-r.txt.gz differ diff --git a/build/data/genbank-species-s.txt.gz b/build/data/genbank-species-s.txt.gz index ea28272..7a198cc 100644 Binary files a/build/data/genbank-species-s.txt.gz and b/build/data/genbank-species-s.txt.gz differ diff --git a/build/data/genbank-species-t.txt.gz b/build/data/genbank-species-t.txt.gz index 37fa9c8..adf71e5 100644 Binary files a/build/data/genbank-species-t.txt.gz and b/build/data/genbank-species-t.txt.gz differ diff --git a/build/data/genbank-species-u.txt.gz b/build/data/genbank-species-u.txt.gz index 03585f8..a70bd3d 100644 Binary files a/build/data/genbank-species-u.txt.gz and 
b/build/data/genbank-species-u.txt.gz differ diff --git a/build/data/genbank-species-v.txt.gz b/build/data/genbank-species-v.txt.gz index 3b18b1a..9f09cba 100644 Binary files a/build/data/genbank-species-v.txt.gz and b/build/data/genbank-species-v.txt.gz differ diff --git a/build/data/genbank-species-w.txt.gz b/build/data/genbank-species-w.txt.gz index 4fa7f90..a01afbb 100644 Binary files a/build/data/genbank-species-w.txt.gz and b/build/data/genbank-species-w.txt.gz differ diff --git a/build/data/genbank-species-x.txt.gz b/build/data/genbank-species-x.txt.gz index 5781485..c216265 100644 Binary files a/build/data/genbank-species-x.txt.gz and b/build/data/genbank-species-x.txt.gz differ diff --git a/build/data/genbank-species-y.txt.gz b/build/data/genbank-species-y.txt.gz index 987daad..692d972 100644 Binary files a/build/data/genbank-species-y.txt.gz and b/build/data/genbank-species-y.txt.gz differ diff --git a/build/data/genbank-species-z.txt.gz b/build/data/genbank-species-z.txt.gz index 223ee65..3d5e134 100644 Binary files a/build/data/genbank-species-z.txt.gz and b/build/data/genbank-species-z.txt.gz differ diff --git a/src/DesignTime/Common.fs b/src/DesignTime/Common.fs index 6398af9..80a6442 100644 --- a/src/DesignTime/Common.fs +++ b/src/DesignTime/Common.fs @@ -301,23 +301,21 @@ module private CacheHelpers = false // Used to load a data file referring to the location of assemblies on - // GenBank's FTP server.. If the file does not exist in the cache + // GenBank's FTP server. If the file does not exist in the cache // location, attempts to download it from the FTP server (with the // above function). let loadAssemblyList (path: string) = - // This should be changed so we don't need to split up the path - // created to get the filename later. - let filename = Seq.last (path.Split('\\')) + let fullPath = getCacheFilePath path // Read the existing file if the data file has already been // downloaded. 
- if File.Exists(path) then - Some(File.OpenRead(path)) + if File.Exists(fullPath) then + Some(File.OpenRead(fullPath)) else - match saveAssemblyList filename with + match saveAssemblyList path with | false -> None - | _ -> Some(File.OpenRead(path)) + | _ -> Some(File.OpenRead(fullPath)) module GenBank = @@ -326,20 +324,13 @@ module private CacheHelpers = | c when System.Char.IsLetter(c) -> c | _ -> '#' - let private getContentPath (fileName: string) = - (*let assemblyDirectory = - Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location) - - Path.Combine(assemblyDirectory, ".", fileName)*) - Path.Combine(Path.GetTempPath(), "BioProviders", fileName) - let private getSpeciesLookupPath (speciesName: string) = let character = getLookupCharacter speciesName - getContentPath $"genbank-species-{character}.txt.gz" + $"genbank-species-{character}.txt.gz" let private getAssemblyLookupPath (speciesName: string) = let character = getLookupCharacter speciesName - getContentPath $"genbank-assemblies-{character}.txt.gz" + $"genbank-assemblies-{character}.txt.gz" let private getSpeciesID (speciesName: string) = let speciesLookupFile = getSpeciesLookupPath speciesName diff --git a/src/DesignTime/DesignTime.fsproj b/src/DesignTime/DesignTime.fsproj index 016e494..d6d4f71 100644 --- a/src/DesignTime/DesignTime.fsproj +++ b/src/DesignTime/DesignTime.fsproj @@ -11,8 +11,6 @@ - - True paket-files/ProvidedTypes.fsi diff --git a/src/RunTime/GenBankFlatFile.fs b/src/RunTime/GenBankFlatFile.fs index 3a30da9..755c407 100644 --- a/src/RunTime/GenBankFlatFile.fs +++ b/src/RunTime/GenBankFlatFile.fs @@ -21,8 +21,9 @@ module GenBankFlatFile = /// let createGenBankFlatFile (path: string) = - // Samuel Smith n7581769. - // Testing deleting old files. + // Delete files that are too old. + // Ideally, we'd have this in a different place, rather than accessed + // any time we want to create a new flat file. CacheAccess.deleteOldFiles // Create DotNet Bio ISequence for the GenBank Flat File.