Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
AlexKenna committed Oct 8, 2023
2 parents 1e642e2 + 7193c7c commit ecaf5dc
Show file tree
Hide file tree
Showing 60 changed files with 252 additions and 31 deletions.
4 changes: 1 addition & 3 deletions .fantomasignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,2 @@
paket-files/
tests/
*.bak
Test.fsx
tests/
8 changes: 1 addition & 7 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -360,10 +360,4 @@ output/
tmp/

# FAKE
.fake

# Code backups
*.bak

# Test script
Test.fsx
.fake
239 changes: 239 additions & 0 deletions DataFileGenerator.fsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
#r "nuget: CsvHelper"
#r "nuget: FluentFTP, 34.0.1"

open CsvHelper
open System.IO
open CsvHelper.Configuration
open System.Globalization
open System.IO.Compression
open FluentFTP

// ------ Record types used for reading and writing files ------
// Rows for the original GenBank TSV file.
// One record is produced per data line when the file is read with
// csv.GetRecords<GenBankRow>(). Field names are matched against the file's
// header row (with any leading '#' trimmed by the reader configuration), so
// they must stay spelled exactly like the column names. Every column is kept
// as raw text; nothing is parsed into numbers or dates here.
type GenBankRow = {
assembly_accession : string
bioproject : string
biosample : string
wgs_master : string
refseq_category : string
taxid : string
species_taxid : string
// Used to group rows into species and to choose the output file by the
// first character of the name.
organism_name : string
infraspecific_name : string
isolate : string
version_status : string
assembly_level : string
release_type : string
genome_rep : string
seq_rel_date : string
asm_name : string
asm_submitter : string
gbrs_paired_asm : string
paired_asm_comp : string
// Rows whose path is the literal "na" are left out of the assembly lists.
ftp_path : string
excluded_from_refseq : string
relation_to_type_material : string
asm_not_live_date : string
assembly_type : string
group : string
genome_size : string
genome_size_ungapped : string
gc_percent : string
replicon_count : string
scaffold_count : string
contig_count : string
annotation_provider : string
annotation_name : string
annotation_date : string
total_gene_count : string
protein_coding_gene_count : string
non_coding_gene_count : string
pubmed_id : string
}

// Rows for the generated assembly file.
// - species_id: ID of the SpeciesRow this assembly belongs to.
// - assembly_accession: copied unchanged from the GenBank row.
// - ftp_path: the GenBank path with the leading genBankURL-length prefix
//   removed.
type AssemblyRow = {
species_id : string
assembly_accession : string
ftp_path : string
}

// Rows for the generated species file.
// - species_id: sequential number (stored as text) assigned when the species
//   list is generated; unique across all generated files.
// - species_name: the organism name taken from the GenBank rows.
type SpeciesRow = {
species_id : string
species_name : string
}

// Sequence of lookup characters that data files are generated for: '#'
// (names that do not start with a letter) followed by 'a' through 'z'.
let characters = Seq.append (Seq.singleton '#') [ 'a' .. 'z' ]

// Base URL for GenBank files on the FTP server. Only its length is used, to
// drop that many leading characters from each row's FTP path.
let genBankURL = "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/"

// ------ Functions for generating and writing data files ------
// Function for matching the first character of a species name.
// Returns the lower-cased first character when it is a letter; any other
// first character - and a null or empty name - maps to '#'.
let getLookupCharacter (name: string) =
    if System.String.IsNullOrEmpty(name) then
        // Nothing to index into; bucket the row with the non-letter names
        // instead of throwing on name.[0].
        '#'
    else
        match name.[0] with
        // Invariant lowering keeps the result independent of the machine's
        // culture (e.g. Turkish 'I' would otherwise become a dotless 'ı').
        | c when System.Char.IsLetter(c) -> System.Char.ToLowerInvariant(c)
        | _ -> '#'

// Build the species rows for a set of GenBank rows.
// The organism names are de-duplicated and sorted, then numbered
// consecutively: the first name receives `count`, the next `count + 1`, and
// so on. IDs are stored as text.
let getSpeciesList (filteredList : GenBankRow list) (count : int) =
    filteredList
    |> List.map (fun entry -> entry.organism_name)
    |> List.distinct
    |> List.sort
    |> List.mapi (fun offset organism ->
        { species_id = (offset + count).ToString()
          species_name = organism })

// Generate the assembly rows for a set of GenBank rows, tagging each with the
// ID of its species.
// - filteredList: the GenBank rows to convert. Rows whose FTP path is "na"
//   are dropped.
// - speciesList: the species rows to take IDs from; every remaining row's
//   organism name must appear in it.
// Returns the rows sorted by species ID and then by accession (both compared
// as strings). Fails with a descriptive error if a row's organism name has no
// entry in speciesList.
let getAssemblyList (filteredList : GenBankRow list) (speciesList : SpeciesRow list) =
    // Index the species IDs by name once, so each row is resolved with a hash
    // lookup rather than a scan of the whole species list.
    let speciesIds =
        System.Collections.Generic.Dictionary<string, string>(System.StringComparer.Ordinal)

    for species in speciesList do
        // Keep the first ID seen for a name, as a first-match search would.
        if not (speciesIds.ContainsKey(species.species_name)) then
            speciesIds.Add(species.species_name, species.species_id)

    // Resolve the species ID for a row, failing loudly on a miss.
    let lookupSpeciesId (row : GenBankRow) =
        match speciesIds.TryGetValue(row.organism_name) with
        | true, speciesId -> speciesId
        | _ ->
            failwithf
                "No species ID found for organism '%s' (assembly %s)."
                row.organism_name
                row.assembly_accession

    // Convert a GenBank row to an assembly row.
    // NOTE(review): the path is shortened by the length of genBankURL without
    // checking that it actually starts with that URL - confirm all paths do.
    let toAssemblyRow (row : GenBankRow) : AssemblyRow =
        { species_id = lookupSpeciesId row
          assembly_accession = row.assembly_accession
          ftp_path = row.ftp_path.[(String.length genBankURL)..] }

    // Order by species ID, then by accession when the IDs are equal.
    let sortAssemblies (assembly1 : AssemblyRow) (assembly2 : AssemblyRow) =
        match assembly1.species_id.CompareTo(assembly2.species_id) with
        | 0 -> assembly1.assembly_accession.CompareTo(assembly2.assembly_accession)
        | result -> result

    filteredList
    |> List.filter (fun (row : GenBankRow) -> not (row.ftp_path.Equals("na")))
    |> List.map toAssemblyRow
    |> List.sortWith sortAssemblies

// Compresses a written text file using GZip compression, writes it to a new
// file named `filename + ".gz"` and deletes the original.
// The streams are released even if the copy fails; the original is only
// deleted after the compressed copy has been written and closed.
let compressFile (filename : string) =
    // Local scope so that all three streams are disposed (and the GZip data
    // flushed) before the original file is deleted.
    let writeCompressedCopy () =
        use originalFile = File.OpenRead(filename)
        use gZipFile = (new FileInfo(filename + ".gz")).Create()
        use gZipStream = new GZipStream(gZipFile, CompressionMode.Compress)
        originalFile.CopyTo(gZipStream)

    writeCompressedCopy ()
    File.Delete(filename)

// ------ FTP functions ------

/// Opens a connection to the NCBI FTP server, passes the connected client to
/// `callback` and returns whatever the callback returns. The client is
/// disposed once the callback has finished.
let internal useNCBIConnection (callback) =
    use client = new FtpClient("ftp://ftp.ncbi.nlm.nih.gov")
    client.Connect()
    client |> callback

// Decides how a download should treat an existing local file.
// - Local file present and modified later than the remote file: Append, so a
//   partial earlier download is resumed.
// - Local file missing, or not newer than the remote file: Overwrite.
// The remote modification time is only requested when the local file exists.
// NOTE(review): the local time comes from File.GetLastWriteTime; confirm that
// GetModifiedTime reports the remote time in a comparable time zone.
let isNewerFile (localPath: string) (remotePath: string) (connection: FtpClient) =
    let localIsNewer () =
        File.GetLastWriteTime(localPath) > connection.GetModifiedTime(remotePath)

    if File.Exists(localPath) && localIsNewer () then
        FtpLocalExists.Append
    else
        FtpLocalExists.Overwrite

/// Downloads `remotePath` from the NCBI FTP server to `localPath` and returns
/// the status reported by the FTP client.
let downloadNCBIFile (localPath: string, remotePath: string) =
    useNCBIConnection (fun (connection: FtpClient) ->
        // Resume or overwrite depending on which copy is newer, and verify
        // the transfer, retrying on mismatch.
        let existsMode = isNewerFile localPath remotePath connection
        connection.DownloadFile(localPath, remotePath, existsMode, FtpVerify.Retry))

// Local location of the downloaded GenBank summary: a "BioProviders_Build"
// folder inside the system temporary directory.
let downloadedFilePath =
    let buildDirectory = Path.Combine(Path.GetTempPath(), "BioProviders_Build")
    Path.Combine(buildDirectory, "downloaded_list.txt")

// ------ Main operations ------

printfn "------------ Starting operations to generate GenBank data file lists for BioProviders. ------------"

printfn "------ Downloading GenBank summary file to %s...... ------" downloadedFilePath

// Fetch the assembly summary; the script stops here if the download failed.
let status = downloadNCBIFile (downloadedFilePath, "/genomes/genbank/assembly_summary_genbank.txt")

if status = FtpStatus.Failed then
    failwith "------ Failed to download file from NCBI FTP server. ------"
elif status = FtpStatus.Skipped then
    printfn "------ File already downloaded. ------"
else
    printfn "------ File downloaded successfully. ------"

printfn "------ Loading in GenBank assembly summary TSV... ------"

// A function to skip lines that start with ##, to ignore the comment.
// The comparison is ordinal so it does not depend on the current culture.
let skipFunction (args : ShouldSkipRecordArgs) =
    args.Row[0].StartsWith("##", System.StringComparison.Ordinal)

// Configuration for the CSV reader. It:
// - Chooses tab as the delimiter;
// - Sets the mode to no escape to ignore quotes;
// - Uses the above function to skip comment lines; and
// - Clears the # symbol on any headers.
let config = new CsvConfiguration(CultureInfo.InvariantCulture)
config.Delimiter <- "\t"
config.Mode <- CsvMode.NoEscape
config.ShouldSkipRecord <- new ShouldSkipRecord(skipFunction)
config.PrepareHeaderForMatch <- fun args -> args.Header.TrimStart('#')

// Load every record of the downloaded GenBank file into a list. The file and
// the CSV reader are only needed while the list is being built, so both are
// disposed as soon as it has been materialised.
let records =
    use reader = new StreamReader(downloadedFilePath)
    use csv = new CsvReader(reader, config)
    Seq.toList (csv.GetRecords<GenBankRow>())

// Show how many records were loaded.
printfn "Loaded %i records." (List.length records)
printfn "------ TSV loaded successfully. ------"

// Generate the lists of species and assemblies for the given character, and
// write each to a GZip-compressed file. An integer accumulator is used to
// ensure unique numerical IDs for all distinct species.
// - fullList: every loaded GenBank row.
// - acc: the first species ID to hand out for this character.
// - character: the lookup character whose rows are processed.
// Returns acc plus the number of species written, i.e. the first free ID for
// the next character (the shape expected by a fold over the characters).
let generateLists (fullList : GenBankRow list) (acc : int) (character : char) =
    // Filter the full list of assemblies for only those that have an organism
    // name matching the current character.
    let filteredList =
        fullList
        |> List.filter (fun (row : GenBankRow) -> getLookupCharacter row.organism_name = character)

    // Generate the lists of species and assemblies for the given character.
    let speciesList = getSpeciesList filteredList acc
    let assemblyList = getAssemblyList filteredList speciesList

    // Write records to `filename` through a CSV writer, then replace the file
    // with its compressed copy.
    let writeCompressed (filename : string) (writeRecords : CsvWriter -> unit) =
        // Create the output folder if it is missing; otherwise opening the
        // writer would fail.
        let directory = Path.GetDirectoryName(filename)
        if not (System.String.IsNullOrEmpty(directory)) then
            Directory.CreateDirectory(directory) |> ignore

        // Local scope so the CSV writer and its stream are flushed and closed
        // before the file is read back for compression.
        let writeFile () =
            use writer = new StreamWriter(filename)
            use csv = new CsvWriter(writer, CultureInfo.InvariantCulture)
            writeRecords csv

        writeFile ()
        compressFile filename

    // Write the species entries to a file.
    writeCompressed
        $"./build/data/genbank-species-{character}.txt"
        (fun (csv : CsvWriter) -> csv.WriteRecords(speciesList))

    // Write the assembly entries to a file.
    writeCompressed
        $"./build/data/genbank-assemblies-{character}.txt"
        (fun (csv : CsvWriter) -> csv.WriteRecords(assemblyList))

    // Add the number of new species to the accumulator, to start at the
    // correct number for the next character.
    acc + List.length speciesList

printfn "------ Generating new lists from loaded GenBank assembly list... ------"

// Process every lookup character in turn, threading the running species count
// through so that IDs continue from one character to the next.
let totalSpecies = characters |> Seq.fold (generateLists records) 0

printfn "------ Successfully generated lists for %i species. ------" totalSpecies
printfn "------------ All operations completed. ------------"
Binary file modified build/data/genbank-assemblies-#.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-assemblies-a.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-assemblies-b.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-assemblies-c.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-assemblies-d.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-assemblies-e.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-assemblies-f.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-assemblies-g.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-assemblies-h.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-assemblies-i.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-assemblies-j.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-assemblies-k.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-assemblies-l.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-assemblies-m.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-assemblies-n.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-assemblies-o.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-assemblies-p.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-assemblies-q.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-assemblies-r.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-assemblies-s.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-assemblies-t.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-assemblies-u.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-assemblies-v.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-assemblies-w.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-assemblies-x.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-assemblies-y.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-assemblies-z.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-species-#.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-species-a.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-species-b.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-species-c.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-species-d.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-species-e.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-species-f.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-species-g.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-species-h.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-species-i.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-species-j.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-species-k.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-species-l.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-species-m.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-species-n.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-species-o.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-species-p.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-species-q.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-species-r.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-species-s.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-species-t.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-species-u.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-species-v.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-species-w.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-species-x.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-species-y.txt.gz
Binary file not shown.
Binary file modified build/data/genbank-species-z.txt.gz
Binary file not shown.
25 changes: 8 additions & 17 deletions src/DesignTime/Common.fs
Original file line number Diff line number Diff line change
Expand Up @@ -301,23 +301,21 @@ module private CacheHelpers =
false

// Used to load a data file referring to the location of assemblies on
// GenBank's FTP server.. If the file does not exist in the cache
// GenBank's FTP server. If the file does not exist in the cache
// location, attempts to download it from the FTP server (with the
// above function).
let loadAssemblyList (path: string) =

// This should be changed so we don't need to split up the path
// created to get the filename later.
let filename = Seq.last (path.Split('\\'))
let fullPath = getCacheFilePath path

// Read the existing file if the data file has already been
// downloaded.
if File.Exists(path) then
Some(File.OpenRead(path))
if File.Exists(fullPath) then
Some(File.OpenRead(fullPath))
else
match saveAssemblyList filename with
match saveAssemblyList path with
| false -> None
| _ -> Some(File.OpenRead(path))
| _ -> Some(File.OpenRead(fullPath))

module GenBank =

Expand All @@ -326,20 +324,13 @@ module private CacheHelpers =
| c when System.Char.IsLetter(c) -> c
| _ -> '#'

let private getContentPath (fileName: string) =
(*let assemblyDirectory =
Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location)
Path.Combine(assemblyDirectory, ".", fileName)*)
Path.Combine(Path.GetTempPath(), "BioProviders", fileName)

let private getSpeciesLookupPath (speciesName: string) =
let character = getLookupCharacter speciesName
getContentPath $"genbank-species-{character}.txt.gz"
$"genbank-species-{character}.txt.gz"

let private getAssemblyLookupPath (speciesName: string) =
let character = getLookupCharacter speciesName
getContentPath $"genbank-assemblies-{character}.txt.gz"
$"genbank-assemblies-{character}.txt.gz"

let private getSpeciesID (speciesName: string) =
let speciesLookupFile = getSpeciesLookupPath speciesName
Expand Down
2 changes: 0 additions & 2 deletions src/DesignTime/DesignTime.fsproj
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@
</PropertyGroup>
<ItemGroup>
<None Include="paket.references" />
<!--<None Include="..\..\build\data\**\*.*" LinkBase="\data" CopyToOutputDirectory="PreserveNewest" CopyToPublishDirectory="PreserveNewest" />-->
<None Include="..\..\build\data\**\*.gz" LinkBase="\" CopyToOutputDirectory="PreserveNewest" CopyToPublishDirectory="PreserveNewest" />
<Compile Include="..\..\paket-files\fsprojects\FSharp.TypeProviders.SDK\src\ProvidedTypes.fsi">
<Paket>True</Paket>
<Link>paket-files/ProvidedTypes.fsi</Link>
Expand Down
5 changes: 3 additions & 2 deletions src/RunTime/GenBankFlatFile.fs
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,9 @@ module GenBankFlatFile =
/// </summary>
let createGenBankFlatFile (path: string) =

// Samuel Smith n7581769.
// Testing deleting old files.
// Delete files that are too old.
// Ideally, we'd have this in a different place, rather than accessed
// any time we want to create a new flat file.
CacheAccess.deleteOldFiles

// Create DotNet Bio ISequence for the GenBank Flat File.
Expand Down

0 comments on commit ecaf5dc

Please sign in to comment.