diff --git a/MANIFEST.SKIP b/MANIFEST.SKIP new file mode 100644 index 0000000..2133859 --- /dev/null +++ b/MANIFEST.SKIP @@ -0,0 +1,2 @@ +bin/quince_chiphmmnew +tmp diff --git a/dist.ini b/dist.ini new file mode 100644 index 0000000..383a95e --- /dev/null +++ b/dist.ini @@ -0,0 +1,25 @@ +name = DETCT +author = James Morris +author = Ian Sealy +license = GPL_3 +copyright_holder = Genome Research Ltd +copyright_year = 2013 +version = 0.1.0 + +[@Basic] +[ExecDir] +dir = script +[FileFinder::ByName / ScriptNotR] +dir = script +skip = .*\.R$ +[ModuleBuild] +[PodWeaver] +finder = :InstallModules +finder = ScriptNotR +[PodCoverageTests] +[PodSyntaxTests] +[Test::Perl::Critic] +[PerlTidy] +[AutoPrereqs] +[PkgVersion] +[Test::Compile] diff --git a/lib/DETCT.pm b/lib/DETCT.pm new file mode 100644 index 0000000..8777d00 --- /dev/null +++ b/lib/DETCT.pm @@ -0,0 +1,21 @@ +## no critic (RequireUseStrict, RequireUseWarnings, RequireTidyCode) +package DETCT; +## use critic + +# ABSTRACT: Transcript Counting API + +## Author : is1 +## Maintainer : is1 +## Created : 2012-09-18 +## Last commit by : $Author$ +## Last modified : $Date$ +## Revision : $Revision$ +## Repository URL : $HeadURL$ + +use warnings; +use strict; +use autodie; +use Carp; +use Try::Tiny; + +1; diff --git a/lib/DETCT/Analysis.pm b/lib/DETCT/Analysis.pm new file mode 100644 index 0000000..0899cd4 --- /dev/null +++ b/lib/DETCT/Analysis.pm @@ -0,0 +1,1569 @@ +## no critic (RequireUseStrict, RequireUseWarnings, RequireTidyCode) +package DETCT::Analysis; +## use critic + +# ABSTRACT: Object representing an analysis of a collection of samples + +## Author : is1 +## Maintainer : is1 +## Created : 2012-09-19 +## Last commit by : $Author$ +## Last modified : $Date$ +## Revision : $Revision$ +## Repository URL : $HeadURL$ + +use warnings; +use strict; +use autodie; +use Carp; +use Try::Tiny; + +use Readonly; +use Class::InsideOut qw( private register id ); +use List::MoreUtils qw( uniq ); +use YAML::Tiny; +use Data::Compare; +use DETCT::Sample; +use DETCT::Sequence; +use DETCT::Misc::BAM; + +=head1 SYNOPSIS + + # Brief code examples + +=cut + +# Attributes: +private name => my %name; # e.g. zmp_ph1 +private sample => my %sample; # arrayref of samples +private sequence => my %sequence; # arrayref of sequences +private read1_length => my %read1_length; # e.g. 30 +private read2_length => my %read2_length; # e.g. 54 +private mismatch_threshold => my %mismatch_threshold; # e.g. 2 +private bin_size => my %bin_size; # e.g. 100 +private peak_buffer_width => my %peak_buffer_width; # e.g. 100 +private hmm_sig_level => my %hmm_sig_level; # e.g. 0.001 +private hmm_binary => my %hmm_binary; # e.g. ~/quince_chiphmmnew +private r_binary => my %r_binary; # e.g. R +private deseq_script => my %deseq_script; # e.g. ~/run_deseq.R +private output_sig_level => my %output_sig_level; # e.g. 0.05 +private ref_fasta => my %ref_fasta; # e.g. zv9.fa +private fasta_index => my %fasta_index; # Bio::DB::Sam::Fai +private ensembl_host => my %ensembl_host; # e.g. ensembldb.ensembl.org +private ensembl_port => my %ensembl_port; # e.g. 3306 +private ensembl_user => my %ensembl_user; # e.g. anonymous +private ensembl_pass => my %ensembl_pass; # e.g. secret +private ensembl_name => my %ensembl_name; # e.g. zv9_core +private ensembl_species => my %ensembl_species; # e.g. danio_rerio +private slice_adaptor => my %slice_adaptor; # Bio::EnsEMBL::DBSQL::SliceAdaptor +private chunk_total => my %chunk_total; # e.g. 
20 +private chunk => my %chunk; # arrayref of arrayrefs of sequences +private test_chunk => my %test_chunk; # e.g. 1 + +# Constants +Readonly our $MAX_NAME_LENGTH => 128; +Readonly our $DEFAULT_ENSEMBL_HOST => 'ensembldb.ensembl.org'; +Readonly our $DEFAULT_ENSEMBL_USER => 'anonymous'; + +=method new + + Usage : my $analysis = DETCT::Analysis->new( { + name => 'zmp_ph1', + read1_length => 30, + read2_length => 54, + mismatch_threshold => 2, + bin_size => 100, + peak_buffer_width => 100, + hmm_sig_level => 0.001, + hmm_binary => 'bin/quince_chiphmmnew', + r_binary => 'R', + deseq_script => 'script/run_deseq.R', + output_sig_level => 0.05, + chunk_total => 20, + } ); + Purpose : Constructor for analysis objects + Returns : DETCT::Analysis + Parameters : Hashref { + name => String, + read1_length => Int, + read2_length => Int, + mismatch_threshold => Int, + bin_size => Int, + peak_buffer_width => Int, + hmm_sig_level => Float, + hmm_binary => String, + r_binary => String, + deseq_script => String, + output_sig_level => Float, + ref_fasta => String or undef, + ensembl_host => String or undef, + ensembl_port => Int or undef, + ensembl_user => String or undef, + ensembl_pass => String or undef, + ensembl_name => String or undef, + ensembl_species => String or undef, + chunk_total => Int, + test_chunk => Int or undef, + } + Throws : No exceptions + Comments : None + +=cut + +sub new { + my ( $class, $arg_ref ) = @_; + my $self = register($class); + $self->set_name( $arg_ref->{name} ); + $self->set_read1_length( $arg_ref->{read1_length} ); + $self->set_read2_length( $arg_ref->{read2_length} ); + $self->set_mismatch_threshold( $arg_ref->{mismatch_threshold} ); + $self->set_bin_size( $arg_ref->{bin_size} ); + $self->set_peak_buffer_width( $arg_ref->{peak_buffer_width} ); + $self->set_hmm_sig_level( $arg_ref->{hmm_sig_level} ); + $self->set_hmm_binary( $arg_ref->{hmm_binary} ); + $self->set_r_binary( $arg_ref->{r_binary} ); + $self->set_deseq_script( $arg_ref->{deseq_script} ); + $self->set_output_sig_level( $arg_ref->{output_sig_level} ); + $self->set_ref_fasta( $arg_ref->{ref_fasta} ); + $self->set_ensembl_host( $arg_ref->{ensembl_host} ); + $self->set_ensembl_port( $arg_ref->{ensembl_port} ); + $self->set_ensembl_user( $arg_ref->{ensembl_user} ); + $self->set_ensembl_pass( $arg_ref->{ensembl_pass} ); + $self->set_ensembl_name( $arg_ref->{ensembl_name} ); + $self->set_ensembl_species( $arg_ref->{ensembl_species} ); + $self->set_chunk_total( $arg_ref->{chunk_total} ); + $self->set_test_chunk( $arg_ref->{test_chunk} ); + return $self; +} + +=method new_from_yaml + + Usage : my $analysis = DETCT::Analysis->new_from_yaml( 'zmp_ph1.yaml' ); + Purpose : Constructor for creating analysis objects from a YAML file + Returns : DETCT::Analysis + Parameters : String (the YAML file) + Throws : If YAML file is missing or not readable + Comments : None + +=cut + +sub new_from_yaml { + my ( $class, $yaml_file ) = @_; + my $self = register($class); + + confess "YAML file ($yaml_file) does not exist or cannot be read" + if !-r $yaml_file; + + my $yaml = YAML::Tiny->read($yaml_file); + + $self->set_name( $yaml->[0]->{name} ); + $self->set_read1_length( $yaml->[0]->{read1_length} ); + $self->set_read2_length( $yaml->[0]->{read2_length} ); + $self->set_mismatch_threshold( $yaml->[0]->{mismatch_threshold} ); + $self->set_bin_size( $yaml->[0]->{bin_size} ); + $self->set_peak_buffer_width( $yaml->[0]->{peak_buffer_width} ); + $self->set_hmm_sig_level( $yaml->[0]->{hmm_sig_level} ); + $self->set_hmm_binary( 
$yaml->[0]->{hmm_binary} ); + $self->set_r_binary( $yaml->[0]->{r_binary} ); + $self->set_deseq_script( $yaml->[0]->{deseq_script} ); + $self->set_output_sig_level( $yaml->[0]->{output_sig_level} ); + $self->set_ref_fasta( $yaml->[0]->{ref_fasta} ); + $self->set_ensembl_host( $yaml->[0]->{ensembl_host} ); + $self->set_ensembl_port( $yaml->[0]->{ensembl_port} ); + $self->set_ensembl_user( $yaml->[0]->{ensembl_user} ); + $self->set_ensembl_pass( $yaml->[0]->{ensembl_pass} ); + $self->set_ensembl_name( $yaml->[0]->{ensembl_name} ); + $self->set_ensembl_species( $yaml->[0]->{ensembl_species} ); + $self->set_chunk_total( $yaml->[0]->{chunk_total} ); + $self->set_test_chunk( $yaml->[0]->{test_chunk} ); + + foreach my $sample_hash ( @{ $yaml->[0]->{samples} } ) { + my $sample = DETCT::Sample->new( + { + name => $sample_hash->{name}, + description => $sample_hash->{description}, + condition => $sample_hash->{condition}, + group => $sample_hash->{group}, + tag => $sample_hash->{tag}, + bam_file => $sample_hash->{bam_file}, + } + ); + $self->add_sample( $sample, 1 ); # 1 = do not validate + } + + $self->_validate(); + + return $self; +} + +=method name + + Usage : my $name = $analysis->name; + Purpose : Getter for name attribute + Returns : String (e.g. "zmp_ph1") + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub name { + my ($self) = @_; + return $name{ id $self}; +} + +=method set_name + + Usage : $analysis->set_name('zmp_ph1'); + Purpose : Setter for name attribute + Returns : undef + Parameters : String (the name) + Throws : No exceptions + Comments : None + +=cut + +sub set_name { + my ( $self, $arg ) = @_; + $name{ id $self} = _check_name($arg); + return; +} + +# Usage : $name = _check_name($name); +# Purpose : Check for valid name +# Returns : String (the valid name) +# Parameters : String (the name) +# Throws : If name is missing +# If name is empty +# If name > $MAX_NAME_LENGTH characters +# Comments : None +sub _check_name { + my ($name) = @_; + + confess 'No name specified' if !defined $name; + confess 'Empty name specified' if !length $name; + confess "Name ($name) longer than $MAX_NAME_LENGTH characters" + if length $name > $MAX_NAME_LENGTH; + + return $name; +} + +=method add_sample + + Usage : $analysis->add_sample($sample); + Purpose : Add a sample to an analysis + Returns : undef + Parameters : DETCT::Sample + Defined or undef (indicating if validation is needed) + Throws : If sample is missing or invalid (i.e. 
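# A minimal usage sketch; the file name (zmp_ph1.yaml), sample values, tag and
# BAM path below are hypothetical. The config keys are the ones read by
# new_from_yaml() and the per-sample keys match the samples loop above.
use DETCT::Analysis;
use DETCT::Sample;

my $analysis = DETCT::Analysis->new_from_yaml('zmp_ph1.yaml');

# Samples can also be added individually; this triggers revalidation of the
# reference sequences across all BAM files.
my $extra_sample = DETCT::Sample->new(
    {
        name        => 'wt_4',
        description => 'wild type 4',
        condition   => 'sibling',
        group       => '4',
        tag         => 'NNNNBCAGAG',
        bam_file    => 'wt_4.bam',
    }
);
$analysis->add_sample($extra_sample);

printf "%s: %d samples, %d reference sequences\n",
    $analysis->name,
    scalar @{ $analysis->get_all_samples() },
    scalar @{ $analysis->get_all_sequences() };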
not a DETCT::Sample + object) + Comments : None + +=cut + +sub add_sample { + my ( $self, $sample, $no_validaton ) = @_; + + confess 'No sample specified' if !defined $sample; + confess 'Class of sample (', ref $sample, ') not DETCT::Sample' + if !$sample->isa('DETCT::Sample'); + + if ( !exists $sample{ id $self} ) { + $sample{ id $self} = [$sample]; + $self->add_all_sequences( $sample->bam_file ); # Because first sample + } + else { + push @{ $sample{ id $self} }, $sample; + } + + if ( !defined $no_validaton ) { + $self->_validate(); + } + + return; +} + +=method get_all_samples + + Usage : $samples = $analysis->get_all_samples(); + Purpose : Get all samples of an analysis + Returns : Arrayref of DETCT::Sample objects + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub get_all_samples { + my ($self) = @_; + + return $sample{ id $self} || []; +} + +=method add_all_sequences + + Usage : $analysis->add_all_sequences($bam_file); + Purpose : Add all sequences (sorted by decreasing length) to an analysis + Returns : undef + Parameters : String (the BAM file) + Throws : No exceptions + Comments : None + +=cut + +sub add_all_sequences { + my ( $self, $bam_file ) = @_; + + $bam_file = DETCT::Sample::check_bam_file($bam_file); + + $sequence{ id $self} = []; + + my %len = DETCT::Misc::BAM::get_reference_sequence_lengths($bam_file); + + foreach my $name ( reverse sort { $len{$a} <=> $len{$b} } keys %len ) { + my $sequence = DETCT::Sequence->new( + { + name => $name, + bp => $len{$name}, + } + ); + + push @{ $sequence{ id $self} }, $sequence; + } + + # Group sequences into chunks + $self->add_all_chunks(); + + return; +} + +=method get_all_sequences + + Usage : $sequences = $analysis->get_all_sequences(); + Purpose : Get all sequences (sorted by decreasing length) of an analysis + Returns : Arrayref of DETCT::Sequence objects + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub get_all_sequences { + my ($self) = @_; + + return $sequence{ id $self} || []; +} + +# Usage : $analysis->_validate(); +# Purpose : Check analysis +# Returns : 1 +# Parameters : None +# Throws : If reference sequences don't match +# Comments : None +sub _validate { + my ($self) = @_; + + my @bam_files = $self->list_all_bam_files(); + + # Compare reference sequence from first BAM file to all other BAM files + my $first_bam_file = shift @bam_files; + my %first_bam_length = + DETCT::Misc::BAM::get_reference_sequence_lengths($first_bam_file); + foreach my $bam_file (@bam_files) { + my %bam_length = + DETCT::Misc::BAM::get_reference_sequence_lengths($bam_file); + if ( !Compare( \%first_bam_length, \%bam_length ) ) { + confess "$first_bam_file and $bam_file use different reference"; + } + } + + return 1; +} + +=method read1_length + + Usage : my $read1_length = $analysis->read1_length; + Purpose : Getter for read 1 length attribute + Returns : +ve Int + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub read1_length { + my ($self) = @_; + return $read1_length{ id $self}; +} + +=method set_read1_length + + Usage : $analysis->set_read1_length(20); + Purpose : Setter for read 1 length attribute + Returns : undef + Parameters : +ve Int (the read 1 length) + Throws : No exceptions + Comments : None + +=cut + +sub set_read1_length { + my ( $self, $arg ) = @_; + $read1_length{ id $self} = _check_read1_length($arg); + return; +} + +# Usage : $read1_length = _check_read1_length($read1_length); +# Purpose : Check for valid read 1 length +# Returns : +ve Int (the valid 
read 1 length) +# Parameters : +ve Int (the read 1 length) +# Throws : If read 1 length is missing or not a positive integer +# Comments : None +sub _check_read1_length { + my ($read1_length) = @_; + return $read1_length + if defined $read1_length && $read1_length =~ m/\A \d+ \z/xms; + confess 'No read 1 length specified' if !defined $read1_length; + confess "Invalid read 1 length ($read1_length) specified"; +} + +=method read2_length + + Usage : my $read2_length = $analysis->read2_length; + Purpose : Getter for read 2 length attribute + Returns : +ve Int + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub read2_length { + my ($self) = @_; + return $read2_length{ id $self}; +} + +=method set_read2_length + + Usage : $analysis->set_read2_length(20); + Purpose : Setter for read 2 length attribute + Returns : undef + Parameters : +ve Int (the read 2 length) + Throws : No exceptions + Comments : None + +=cut + +sub set_read2_length { + my ( $self, $arg ) = @_; + $read2_length{ id $self} = _check_read2_length($arg); + return; +} + +# Usage : $read2_length = _check_read2_length($read2_length); +# Purpose : Check for valid read 2 length +# Returns : +ve Int (the valid read 2 length) +# Parameters : +ve Int (the read 2 length) +# Throws : If read 2 length is missing or not a positive integer +# Comments : None +sub _check_read2_length { + my ($read2_length) = @_; + return $read2_length + if defined $read2_length && $read2_length =~ m/\A \d+ \z/xms; + confess 'No read 2 length specified' if !defined $read2_length; + confess "Invalid read 2 length ($read2_length) specified"; +} + +=method mismatch_threshold + + Usage : my $mismatch_threshold = $analysis->mismatch_threshold; + Purpose : Getter for mismatch threshold attribute + Returns : +ve Int + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub mismatch_threshold { + my ($self) = @_; + return $mismatch_threshold{ id $self}; +} + +=method set_mismatch_threshold + + Usage : $analysis->set_mismatch_threshold(20); + Purpose : Setter for mismatch threshold attribute + Returns : undef + Parameters : +ve Int (the mismatch threshold) + Throws : No exceptions + Comments : None + +=cut + +sub set_mismatch_threshold { + my ( $self, $arg ) = @_; + $mismatch_threshold{ id $self} = _check_mismatch_threshold($arg); + return; +} + +# Usage : $mismatch_threshold +# = _check_mismatch_threshold($mismatch_threshold); +# Purpose : Check for valid mismatch threshold +# Returns : +ve Int (the valid mismatch threshold) +# Parameters : +ve Int (the mismatch threshold) +# Throws : If mismatch threshold is missing or not a positive integer +# Comments : None +sub _check_mismatch_threshold { + my ($mismatch_threshold) = @_; + return $mismatch_threshold + if defined $mismatch_threshold && $mismatch_threshold =~ m/\A \d+ \z/xms; + confess 'No mismatch threshold specified' if !defined $mismatch_threshold; + confess "Invalid mismatch threshold ($mismatch_threshold) specified"; +} + +=method bin_size + + Usage : my $bin_size = $analysis->bin_size; + Purpose : Getter for bin size attribute + Returns : +ve Int + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub bin_size { + my ($self) = @_; + return $bin_size{ id $self}; +} + +=method set_bin_size + + Usage : $analysis->set_bin_size(100); + Purpose : Setter for bin size attribute + Returns : undef + Parameters : +ve Int (the bin size) + Throws : No exceptions + Comments : None + +=cut + +sub set_bin_size { + my ( $self, $arg ) = @_; + $bin_size{ id $self} 
= _check_bin_size($arg); + return; +} + +# Usage : $bin_size = _check_bin_size($bin_size); +# Purpose : Check for valid bin size +# Returns : +ve Int (the valid bin size) +# Parameters : +ve Int (the bin size) +# Throws : If bin size is missing or not a positive integer +# Comments : None +sub _check_bin_size { + my ($bin_size) = @_; + return $bin_size + if defined $bin_size && $bin_size =~ m/\A \d+ \z/xms; + confess 'No bin size specified' if !defined $bin_size; + confess "Invalid bin size ($bin_size) specified"; +} + +=method peak_buffer_width + + Usage : my $peak_buffer_width = $analysis->peak_buffer_width; + Purpose : Getter for peak buffer width attribute + Returns : +ve Int + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub peak_buffer_width { + my ($self) = @_; + return $peak_buffer_width{ id $self}; +} + +=method set_peak_buffer_width + + Usage : $analysis->set_peak_buffer_width(100); + Purpose : Setter for peak buffer width attribute + Returns : undef + Parameters : +ve Int (the peak buffer width) + Throws : No exceptions + Comments : None + +=cut + +sub set_peak_buffer_width { + my ( $self, $arg ) = @_; + $peak_buffer_width{ id $self} = _check_peak_buffer_width($arg); + return; +} + +# Usage : $peak_buffer_width = _check_peak_buffer_width($peak_buffer_width); +# Purpose : Check for valid peak buffer width +# Returns : +ve Int (the valid peak buffer width) +# Parameters : +ve Int (the peak buffer width) +# Throws : If peak buffer width is missing or not a positive integer +# Comments : None +sub _check_peak_buffer_width { + my ($peak_buffer_width) = @_; + return $peak_buffer_width + if defined $peak_buffer_width && $peak_buffer_width =~ m/\A \d+ \z/xms; + confess 'No peak buffer width specified' if !defined $peak_buffer_width; + confess "Invalid peak buffer width ($peak_buffer_width) specified"; +} + +=method hmm_sig_level + + Usage : my $hmm_sig_level = $analysis->hmm_sig_level; + Purpose : Getter for HMM significance level attribute + Returns : +ve Float + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub hmm_sig_level { + my ($self) = @_; + return $hmm_sig_level{ id $self}; +} + +=method set_hmm_sig_level + + Usage : $analysis->set_hmm_sig_level(0.001); + Purpose : Setter for HMM significance level attribute + Returns : undef + Parameters : +ve Float (the HMM significance level) + Throws : No exceptions + Comments : None + +=cut + +sub set_hmm_sig_level { + my ( $self, $arg ) = @_; + $hmm_sig_level{ id $self} = _check_hmm_sig_level($arg); + return; +} + +# Usage : $hmm_sig_level = _check_hmm_sig_level($hmm_sig_level); +# Purpose : Check for valid HMM significance level +# Returns : +ve Float (the valid HMM significance level) +# Parameters : +ve Float (the HMM significance level) +# Throws : If HMM significance level is missing or not a positive float +# Comments : None +sub _check_hmm_sig_level { + my ($hmm_sig_level) = @_; + return $hmm_sig_level + if defined $hmm_sig_level && $hmm_sig_level =~ m/\A \d* [.] 
\d+ \z/xms; + confess 'No HMM significance level specified' if !defined $hmm_sig_level; + confess "Invalid HMM significance level ($hmm_sig_level) specified"; +} + +=method hmm_binary + + Usage : my $hmm_binary = $analysis->hmm_binary; + Purpose : Getter for HMM binary attribute + Returns : String + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub hmm_binary { + my ($self) = @_; + return $hmm_binary{ id $self}; +} + +=method set_hmm_binary + + Usage : $analysis->set_hmm_binary('bin/quince_chiphmmnew'); + Purpose : Setter for HMM binary attribute + Returns : undef + Parameters : String (the HMM binary) + Throws : No exceptions + Comments : None + +=cut + +sub set_hmm_binary { + my ( $self, $arg ) = @_; + $hmm_binary{ id $self} = _check_hmm_binary($arg); + return; +} + +# Usage : $hmm_binary = _check_hmm_binary($hmm_binary); +# Purpose : Check for valid HMM binary +# Returns : String (the valid HMM binary) +# Parameters : String (the HMM binary) +# Throws : If HMM binary is missing or not readable +# Comments : None +sub _check_hmm_binary { + my ($hmm_binary) = @_; + return $hmm_binary if defined $hmm_binary && -r $hmm_binary; + confess 'No HMM binary specified' if !defined $hmm_binary; + confess "HMM binary ($hmm_binary) does not exist or cannot be read"; +} + +=method r_binary + + Usage : my $r_binary = $analysis->r_binary; + Purpose : Getter for R binary attribute + Returns : String + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub r_binary { + my ($self) = @_; + return $r_binary{ id $self}; +} + +=method set_r_binary + + Usage : $analysis->set_r_binary('R'); + Purpose : Setter for R binary attribute + Returns : undef + Parameters : String (the R binary) + Throws : No exceptions + Comments : None + +=cut + +sub set_r_binary { + my ( $self, $arg ) = @_; + $r_binary{ id $self} = _check_r_binary($arg); + return; +} + +# Usage : $r_binary = _check_r_binary($r_binary); +# Purpose : Check for valid R binary +# Returns : String (the valid R binary) +# Parameters : String (the R binary) +# Throws : If R binary is missing +# Comments : None +sub _check_r_binary { + my ($r_binary) = @_; + return $r_binary if defined $r_binary; + confess 'No R binary specified'; +} + +=method deseq_script + + Usage : my $deseq_script = $analysis->deseq_script; + Purpose : Getter for DESeq script attribute + Returns : String + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub deseq_script { + my ($self) = @_; + return $deseq_script{ id $self}; +} + +=method set_deseq_script + + Usage : $analysis->set_deseq_script('script/run_deseq.R'); + Purpose : Setter for DESeq script attribute + Returns : undef + Parameters : String (the DESeq script) + Throws : No exceptions + Comments : None + +=cut + +sub set_deseq_script { + my ( $self, $arg ) = @_; + $deseq_script{ id $self} = _check_deseq_script($arg); + return; +} + +# Usage : $deseq_script = _check_deseq_script($deseq_script); +# Purpose : Check for valid DESeq script +# Returns : String (the valid DESeq script) +# Parameters : String (the DESeq script) +# Throws : If DESeq script is missing or not readable +# Comments : None +sub _check_deseq_script { + my ($deseq_script) = @_; + return $deseq_script if defined $deseq_script && -r $deseq_script; + confess 'No DESeq script specified' if !defined $deseq_script; + confess "DESeq script ($deseq_script) does not exist or cannot be read"; +} + +=method output_sig_level + + Usage : my $output_sig_level = $analysis->output_sig_level; + Purpose : 
Getter for output significance level attribute + Returns : +ve Float + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub output_sig_level { + my ($self) = @_; + return $output_sig_level{ id $self}; +} + +=method set_output_sig_level + + Usage : $analysis->set_output_sig_level(0.001); + Purpose : Setter for output significance level attribute + Returns : undef + Parameters : +ve Float (the output significance level) + Throws : No exceptions + Comments : None + +=cut + +sub set_output_sig_level { + my ( $self, $arg ) = @_; + $output_sig_level{ id $self} = _check_output_sig_level($arg); + return; +} + +# Usage : $output_sig_level = _check_output_sig_level($output_sig_level); +# Purpose : Check for valid output significance level +# Returns : +ve Float (the valid output significance level) +# Parameters : +ve Float (the output significance level) +# Throws : If output significance level is missing or not a positive float +# Comments : None +sub _check_output_sig_level { + my ($output_sig_level) = @_; + return $output_sig_level + if defined $output_sig_level + && $output_sig_level =~ m/\A \d* [.] \d+ \z/xms; + confess 'No output significance level specified' + if !defined $output_sig_level; + confess "Invalid output significance level ($output_sig_level) specified"; +} + +=method ref_fasta + + Usage : my $ref_fasta = $analysis->ref_fasta; + Purpose : Getter for reference FASTA attribute + Returns : String + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub ref_fasta { + my ($self) = @_; + return $ref_fasta{ id $self}; +} + +=method set_ref_fasta + + Usage : $analysis->set_ref_fasta('zv9.fa'); + Purpose : Setter for reference FASTA attribute + Returns : undef + Parameters : String (the reference FASTA) + Throws : No exceptions + Comments : None + +=cut + +sub set_ref_fasta { + my ( $self, $arg ) = @_; + $ref_fasta{ id $self} = _check_ref_fasta($arg); + return; +} + +# Usage : $ref_fasta = _check_ref_fasta($ref_fasta); +# Purpose : Check for valid reference FASTA +# Returns : String (the valid reference FASTA) +# Parameters : String (the reference FASTA) +# Throws : If reference FASTA is defined but not readable +# Comments : None +sub _check_ref_fasta { + my ($ref_fasta) = @_; + return $ref_fasta if !defined $ref_fasta || -r $ref_fasta; + confess "Reference FASTA ($ref_fasta) cannot be read"; +} + +=method ensembl_host + + Usage : my $ensembl_host = $analysis->ensembl_host; + Purpose : Getter for Ensembl host attribute + Returns : String + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +=method fasta_index + + Usage : my $fai = $analysis->fasta_index; + Purpose : Getter for FASTA index attribute + Returns : Bio::DB::Sam::Fai + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub fasta_index { + my ($self) = @_; + + if ( !defined $fasta_index{ id $self} && $self->ref_fasta ) { + + # We can create a FASTA index object + $self->set_fasta_index( Bio::DB::Sam::Fai->load( $self->ref_fasta ) ); + } + + return $fasta_index{ id $self}; +} + +=method set_fasta_index + + Usage : $analysis->set_fasta_index($fai); + Purpose : Setter for FASTA index attribute + Returns : undef + Parameters : Bio::DB::Sam::Fai + Throws : No exceptions + Comments : None + +=cut + +sub set_fasta_index { + my ( $self, $arg ) = @_; + $fasta_index{ id $self} = _check_fasta_index($arg); + return; +} + +# Usage : $fai = _check_fasta_index($fai); +# Purpose : Check for valid FASTA index +# Returns : Bio::DB::Sam::Fai +# Parameters : 
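# A minimal sketch of the lazy FASTA index, assuming $analysis is a
# DETCT::Analysis object; 'zv9.fa' is the hypothetical path used in the
# attribute comments. Setting a reference FASTA is enough, and the first call
# to fasta_index() loads a Bio::DB::Sam::Fai for it.
$analysis->set_ref_fasta('zv9.fa');
my $fai = $analysis->fasta_index;    # Bio::DB::Sam::Fai, created on first use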
Bio::DB::Sam::Fai +# Throws : If FASTA index is missing or invalid (i.e. not a +# Bio::DB::Sam::Fai object) +# Comments : None +sub _check_fasta_index { + my ($fasta_index) = @_; + return $fasta_index + if defined $fasta_index && $fasta_index->isa('Bio::DB::Sam::Fai'); + confess 'No FASTA index specified' if !defined $fasta_index; + confess 'Class of FASTA index (', ref $fasta_index, + ') not Bio::DB::Sam::Fai'; +} + +sub ensembl_host { + my ($self) = @_; + return $ensembl_host{ id $self}; +} + +=method set_ensembl_host + + Usage : $analysis->set_ensembl_host('ensembldb.ensembl.org'); + Purpose : Setter for Ensembl host attribute + Returns : undef + Parameters : String (the Ensembl host) + Throws : No exceptions + Comments : None + +=cut + +sub set_ensembl_host { + my ( $self, $arg ) = @_; + $ensembl_host{ id $self} = $arg; + return; +} + +=method ensembl_port + + Usage : my $ensembl_port = $analysis->ensembl_port; + Purpose : Getter for Ensembl port attribute + Returns : +ve Int + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub ensembl_port { + my ($self) = @_; + return $ensembl_port{ id $self}; +} + +=method set_ensembl_port + + Usage : $analysis->set_ensembl_port(3306); + Purpose : Setter for Ensembl port attribute + Returns : undef + Parameters : +ve Int (the Ensembl port) + Throws : No exceptions + Comments : None + +=cut + +sub set_ensembl_port { + my ( $self, $arg ) = @_; + $ensembl_port{ id $self} = _check_ensembl_port($arg); + return; +} + +# Usage : $ensembl_port = _check_ensembl_port($ensembl_port); +# Purpose : Check for valid Ensembl port +# Returns : +ve Int (the valid Ensembl port) +# Parameters : +ve Int (the Ensembl port) +# Throws : If Ensembl port is defined but not a positive integer +# Comments : None +sub _check_ensembl_port { + my ($ensembl_port) = @_; + return $ensembl_port + if !defined $ensembl_port || $ensembl_port =~ m/\A \d+ \z/xms; + confess "Invalid Ensembl port ($ensembl_port) specified"; +} + +=method ensembl_user + + Usage : my $ensembl_user = $analysis->ensembl_user; + Purpose : Getter for Ensembl username attribute + Returns : String + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub ensembl_user { + my ($self) = @_; + return $ensembl_user{ id $self}; +} + +=method set_ensembl_user + + Usage : $analysis->set_ensembl_user('anonymous'); + Purpose : Setter for Ensembl username attribute + Returns : undef + Parameters : String (the Ensembl username) + Throws : No exceptions + Comments : None + +=cut + +sub set_ensembl_user { + my ( $self, $arg ) = @_; + $ensembl_user{ id $self} = $arg; + return; +} + +=method ensembl_pass + + Usage : my $ensembl_pass = $analysis->ensembl_pass; + Purpose : Getter for Ensembl password attribute + Returns : String + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub ensembl_pass { + my ($self) = @_; + return $ensembl_pass{ id $self}; +} + +=method set_ensembl_pass + + Usage : $analysis->set_ensembl_pass('secret'); + Purpose : Setter for Ensembl password attribute + Returns : undef + Parameters : String (the Ensembl password) + Throws : No exceptions + Comments : None + +=cut + +sub set_ensembl_pass { + my ( $self, $arg ) = @_; + $ensembl_pass{ id $self} = $arg; + return; +} + +=method ensembl_name + + Usage : my $ensembl_name = $analysis->ensembl_name; + Purpose : Getter for Ensembl database name attribute + Returns : String + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub ensembl_name { + my ($self) = @_; + return 
$ensembl_name{ id $self}; +} + +=method set_ensembl_name + + Usage : $analysis->set_ensembl_name('zv9_core'); + Purpose : Setter for Ensembl database name attribute + Returns : undef + Parameters : String (the Ensembl database name) + Throws : No exceptions + Comments : None + +=cut + +sub set_ensembl_name { + my ( $self, $arg ) = @_; + $ensembl_name{ id $self} = $arg; + return; +} + +=method ensembl_species + + Usage : my $ensembl_species = $analysis->ensembl_species; + Purpose : Getter for Ensembl species attribute + Returns : String + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub ensembl_species { + my ($self) = @_; + return $ensembl_species{ id $self}; +} + +=method set_ensembl_species + + Usage : $analysis->set_ensembl_species('danio_rerio'); + Purpose : Setter for Ensembl species attribute + Returns : undef + Parameters : String (the Ensembl species) + Throws : No exceptions + Comments : None + +=cut + +sub set_ensembl_species { + my ( $self, $arg ) = @_; + $ensembl_species{ id $self} = $arg; + return; +} + +=method slice_adaptor + + Usage : my $slice_adaptor = $analysis->slice_adaptor; + Purpose : Getter for Ensembl slice adaptor attribute + Returns : Bio::EnsEMBL::DBSQL::SliceAdaptor + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub slice_adaptor { + my ($self) = @_; + + if ( !defined $slice_adaptor{ id $self} + && ( $self->ensembl_species || $self->ensembl_name ) ) + { + # We can create an Ensembl slice adaptor + $self->_create_slice_adaptor(); + } + + return $slice_adaptor{ id $self}; +} + +=method set_slice_adaptor + + Usage : $analysis->set_slice_adaptor($slice_adaptor); + Purpose : Setter for Ensembl slice adaptor attribute + Returns : undef + Parameters : Bio::EnsEMBL::DBSQL::SliceAdaptor + Throws : No exceptions + Comments : None + +=cut + +sub set_slice_adaptor { + my ( $self, $arg ) = @_; + $slice_adaptor{ id $self} = _check_slice_adaptor($arg); + return; +} + +# Usage : $slice_adaptor = _check_slice_adaptor($slice_adaptor); +# Purpose : Check for valid Ensembl slice adaptor +# Returns : Bio::EnsEMBL::DBSQL::SliceAdaptor +# Parameters : Bio::EnsEMBL::DBSQL::SliceAdaptor +# Throws : If slice adaptor is missing or invalid (i.e. 
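# A minimal sketch of the lazy Ensembl slice adaptor, assuming $analysis is a
# DETCT::Analysis object; 'danio_rerio' is the example species from the
# attribute comments. With only a species (or a database name) set, the host
# and user fall back to $DEFAULT_ENSEMBL_HOST and $DEFAULT_ENSEMBL_USER.
$analysis->set_ensembl_species('danio_rerio');
my $slice_adaptor = $analysis->slice_adaptor;    # connects on first use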
not a +# Bio::EnsEMBL::DBSQL::SliceAdaptor object) +# Comments : None +sub _check_slice_adaptor { + my ($slice_adaptor) = @_; + return $slice_adaptor + if defined $slice_adaptor + && $slice_adaptor->isa('Bio::EnsEMBL::DBSQL::SliceAdaptor'); + confess 'No Ensembl slice adaptor specified' if !defined $slice_adaptor; + confess 'Class of Ensembl slice adaptor (', ref $slice_adaptor, + ') not Bio::EnsEMBL::DBSQL::SliceAdaptor'; +} + +=method chunk_total + + Usage : my $chunk_total = $analysis->chunk_total; + Purpose : Getter for chunk total attribute + Returns : +ve Int + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub chunk_total { + my ($self) = @_; + return $chunk_total{ id $self}; +} + +=method set_chunk_total + + Usage : $analysis->set_chunk_total(20); + Purpose : Setter for chunk total attribute + Returns : undef + Parameters : +ve Int (the chunk total) + Throws : No exceptions + Comments : None + +=cut + +sub set_chunk_total { + my ( $self, $arg ) = @_; + $chunk_total{ id $self} = _check_chunk_total($arg); + + # Recalculate chunks if necessary + if ( scalar @{ $self->get_all_samples() } ) { + $self->add_all_chunks(); + } + + return; +} + +# Usage : $chunk_total = _check_chunk_total($chunk_total); +# Purpose : Check for valid chunk total +# Returns : +ve Int (the valid chunk total) +# Parameters : +ve Int (the chunk total) +# Throws : If chunk total is missing or not a positive integer +# Comments : None +sub _check_chunk_total { + my ($chunk_total) = @_; + return $chunk_total + if defined $chunk_total && $chunk_total =~ m/\A \d+ \z/xms; + confess 'No chunk total specified' if !defined $chunk_total; + confess "Invalid chunk total ($chunk_total) specified"; +} + +=method test_chunk + + Usage : my $test_chunk = $analysis->test_chunk; + Purpose : Getter for test chunk attribute + Returns : +ve Int + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub test_chunk { + my ($self) = @_; + return $test_chunk{ id $self}; +} + +=method set_test_chunk + + Usage : $analysis->set_test_chunk(1); + Purpose : Setter for test chunk attribute + Returns : undef + Parameters : +ve Int (the test chunk) + Throws : No exceptions + Comments : None + +=cut + +sub set_test_chunk { + my ( $self, $arg ) = @_; + $test_chunk{ id $self} = $arg; + return; +} + +=method add_all_chunks + + Usage : $analysis->add_all_chunks(); + Purpose : Add all chunks (groups of sequences) to an analysis + Returns : undef + Parameters : None + Throws : No exceptions + Comments : Groups all sequences into a specific number of (roughly equally + sized) chunks + +=cut + +sub add_all_chunks { + my ($self) = @_; + + my @seqs = @{ $self->get_all_sequences() }; + + # Get total sequence length + my $total_bp = 0; + foreach my $seq (@seqs) { + $total_bp += $seq->bp; + } + + # Get chunk target size (+ 1 to ensure slight overestimate) + my $target_chunk_size = int( $total_bp / $self->chunk_total + 1 ); + + my @chunks; + my @chunk_size = map { 0 } 1 .. $self->chunk_total; + + # Iterate over sequences + SEQ: foreach my $seq (@seqs) { + + # Iterate over each chunk + foreach my $chunk_index ( 0 .. 
$self->chunk_total - 1 ) { + + # Add sequence to chunk if there's room or if the chunk is empty + if ( $chunk_size[$chunk_index] + $seq->bp <= $target_chunk_size + || $chunk_size[$chunk_index] == 0 ) + { + push @{ $chunks[$chunk_index] }, $seq; + $chunk_size[$chunk_index] += $seq->bp; + next SEQ; # Next sequence + } + } + + # Sequence hasn't been added to a chunk, so add to chunk with most room + my $roomy_chunk_index = 0; + foreach my $chunk_index ( 0 .. $self->chunk_total - 1 ) { + if ( $chunk_size[$chunk_index] < $chunk_size[$roomy_chunk_index] ) { + $roomy_chunk_index = $chunk_index; + } + } + push @{ $chunks[$roomy_chunk_index] }, $seq; + $chunk_size[$roomy_chunk_index] += $seq->bp; + } + + # Iterate over empty chunks in order to attempt to add sequences to them + foreach my $empty_chunk_index ( 0 .. $self->chunk_total - 1 ) { + next if defined $chunks[$empty_chunk_index]; # Only want empty chunks + + # Find chunk with highest number of sequences (but more than one) + my $max_seqs_chunk_index; + my $max_seqs; + foreach my $chunk_index ( 0 .. $self->chunk_total - 1 ) { + next if !defined $chunks[$chunk_index]; # Only want non-empty chunks + my $seqs = scalar @{ $chunks[$chunk_index] }; + if ( $seqs > 1 && ( !defined $max_seqs || $seqs > $max_seqs ) ) { + $max_seqs_chunk_index = $chunk_index; + $max_seqs = $seqs; + } + } + + last if !defined $max_seqs; # No splittable chunks + + # Split chosen chunk into empty chunk + my $split_index = int( $max_seqs / 2 ); + @{ $chunks[$empty_chunk_index] } = + splice @{ $chunks[$max_seqs_chunk_index] }, 0, $split_index; + } + + $chunk{ id $self} = \@chunks; + + # Number of chunks may be smaller than requested chunk total, so adjust + $chunk_total{ id $self} = scalar @chunks; + + return; +} + +=method get_all_chunks + + Usage : $chunks = $analysis->get_all_chunks(); + Purpose : Get all chunks (groups of sequences) of an analysis + Returns : Arrayref of arrayrefs of DETCT::Sequence objects + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub get_all_chunks { + my ($self) = @_; + + my $chunks = $chunk{ id $self} || []; + + # If a test chunk is specified then only return that chunk not all chunks + if ( $self->test_chunk && exists $chunks->[ $self->test_chunk - 1 ] ) { + $chunks = [ $chunks->[ $self->test_chunk - 1 ] ]; + } + + return $chunks; +} + +=method list_all_bam_files + + Usage : @bam_files = $analysis->list_all_bam_files(); + Purpose : Get all BAM files used in an analysis + Returns : Arrayref of strings + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub list_all_bam_files { + my ($self) = @_; + + my $samples = $self->get_all_samples(); + + my @bam_files = map { $_->bam_file } @{$samples}; + + return uniq( sort @bam_files ); +} + +=method list_all_tags_by_bam_file + + Usage : @tags = $analysis->list_all_tags_by_bam_file(); + Purpose : Get all tags used in an analysis in a particular BAM file + Returns : Arrayref of strings + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub list_all_tags_by_bam_file { + my ( $self, $bam_file ) = @_; + + my $samples = $self->get_all_samples(); + + my @tags = map { $_->tag } grep { $_->bam_file eq $bam_file } @{$samples}; + + return uniq( sort @tags ); +} + +=method get_subsequence + + Usage : $seq = $analysis->get_subsequence('1', 1, 10); + Purpose : Get subsequence from reference + Returns : String (sequence) + Parameters : String (the sequence name) + Int (the sequence start) + Int (the sequence end) + Int (the sequence strand) + 
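# Two minimal sketches, assuming $analysis is a DETCT::Analysis object with
# samples already added. First, sequences are packed into roughly equal-sized
# chunks for parallel processing, so a driver script would normally loop over
# get_all_chunks(). Second, get_subsequence() needs a strand as well as a
# sequence name, start and end, and uses the FASTA index or the Ensembl
# database, whichever is configured. The coordinates are made up.
$analysis->set_chunk_total(20);
foreach my $chunk ( @{ $analysis->get_all_chunks() } ) {
    my $bp = 0;
    $bp += $_->bp foreach @{$chunk};
    printf "chunk of %d sequence(s), %d bp\n", scalar @{$chunk}, $bp;
}

my $subseq = $analysis->get_subsequence( '1', 1, 10, 1 );    # 10 bp of chr 1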
Throws : If sequence name is missing + If sequence start is missing + If sequence end is missing + If sequence strand is missing + Comments : None + +=cut + +sub get_subsequence { + my ( $self, $seq_name, $start, $end, $strand ) = @_; + + confess 'No sequence name specified' if !defined $seq_name; + confess 'No sequence start specified' if !defined $start; + confess 'No sequence end specified' if !defined $end; + confess 'No sequence strand specified' if !defined $strand; + + # Avoid negative positions (but don't worry if end is larger than sequence) + if ( $start < 1 ) { + $start = 1; + } + if ( $end < 1 ) { + $end = 1; + } + + my $subseq; + + if ( $self->fasta_index ) { + $subseq = DETCT::Misc::BAM::get_sequence( + { + fasta_index => $self->fasta_index, + seq_name => $seq_name, + start => $start, + end => $end, + strand => $strand, + } + ); + } + elsif ( $self->slice_adaptor ) { + $subseq = + $self->slice_adaptor->fetch_by_region( 'toplevel', $seq_name, $start, + $end, $strand )->seq; + } + else { + confess 'No reference FASTA or Ensembl database'; + } + + return uc $subseq; +} + +# Usage : $self->_create_slice_adaptor(); +# Purpose : Create an Ensembl slice adaptor +# Returns : Undef +# Parameters : None +# Throws : No exceptions +# Comments : None +sub _create_slice_adaptor { + my ($self) = @_; + + my $host = + $self->ensembl_host ? $self->ensembl_host : $DEFAULT_ENSEMBL_HOST; + my $port = $self->ensembl_port; + my $user = + $self->ensembl_user ? $self->ensembl_user : $DEFAULT_ENSEMBL_USER; + my $pass = $self->ensembl_pass; + my $slice_adaptor; + if ( !$self->ensembl_name ) { + + # Get slice adaptor via registry + require Bio::EnsEMBL::Registry; + Bio::EnsEMBL::Registry->load_registry_from_db( + -host => $host, + -port => $port, + -user => $user, + -pass => $pass, + -species => $self->ensembl_species, + ); + $slice_adaptor = + Bio::EnsEMBL::Registry->get_adaptor( $self->ensembl_species, 'core', + 'slice' ); + } + else { + # Get slice adaptor from specific database + require Bio::EnsEMBL::DBSQL::DBAdaptor; + my $ensembl_db = Bio::EnsEMBL::DBSQL::DBAdaptor->new( + -host => $host, + -port => $port, + -user => $user, + -pass => $pass, + -dbname => $self->ensembl_name, + ); + $slice_adaptor = $ensembl_db->get_SliceAdaptor(); + } + + $self->set_slice_adaptor($slice_adaptor); + + return; +} + +1; diff --git a/lib/DETCT/Gene.pm b/lib/DETCT/Gene.pm new file mode 100644 index 0000000..4bc5d88 --- /dev/null +++ b/lib/DETCT/Gene.pm @@ -0,0 +1,574 @@ +## no critic (RequireUseStrict, RequireUseWarnings, RequireTidyCode) +package DETCT::Gene; +## use critic + +# ABSTRACT: Object representing a gene + +## Author : is1 +## Maintainer : is1 +## Created : 2012-11-24 +## Last commit by : $Author$ +## Last modified : $Date$ +## Revision : $Revision$ +## Repository URL : $HeadURL$ + +use warnings; +use strict; +use autodie; +use Carp; +use Try::Tiny; + +use Readonly; +use Class::InsideOut qw( private register id ); +use Scalar::Util qw( weaken ); + +=head1 SYNOPSIS + + # Brief code examples + +=cut + +# Attributes: +private genebuild_version => my %genebuild_version; # e.g. e69 +private stable_id => my %stable_id; # e.g. ENSDARG00000095747 +private name => my %name; # e.g. cxc64 +private description => my %description; # e.g. CXC chemokine 64... +private biotype => my %biotype; # e.g. protein_coding +private seq_name => my %seq_name; # e.g. 5 +private start => my %start; # e.g. 40352744 +private end => my %end; # e.g. 40354399 +private strand => my %strand; # e.g. 
1 +private transcript => my %transcript; # DETCT::Transcript + +# Constants +Readonly our $MAX_NAME_LENGTH => 128; + +=method new + + Usage : my $gene = DETCT::Gene->new( { + genebuild_version => 'e61', + stable_id => 'ENSDARG00000095747', + biotype => 'protein_coding', + seq_name => '5', + start => 40352744, + end => 40354399, + strand => 1, + } ); + Purpose : Constructor for gene objects + Returns : DETCT::Gene + Parameters : Hashref { + genebuild_version => String, + stable_id => String, + name => String or undef, + description => String or undef, + biotype => String, + seq_name => String, + start => +ve Int, + end => +ve Int, + strand => Int (1 or -1), + } + Throws : No exceptions + Comments : None + +=cut + +sub new { + my ( $class, $arg_ref ) = @_; + my $self = register($class); + $self->set_genebuild_version( $arg_ref->{genebuild_version} ); + $self->set_stable_id( $arg_ref->{stable_id} ); + $self->set_name( $arg_ref->{name} ); + $self->set_description( $arg_ref->{description} ); + $self->set_biotype( $arg_ref->{biotype} ); + $self->set_seq_name( $arg_ref->{seq_name} ); + $self->set_start( $arg_ref->{start} ); + $self->set_end( $arg_ref->{end} ); + $self->set_strand( $arg_ref->{strand} ); + return $self; +} + +=method genebuild_version + + Usage : my $gv = $gene->genebuild_version; + Purpose : Getter for genebuild version attribute + Returns : String (e.g. "e61" for Ensembl 61) + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub genebuild_version { + my ($self) = @_; + return $genebuild_version{ id $self}; +} + +=method set_genebuild_version + + Usage : $gene->set_genebuild_version('e61'); + Purpose : Setter for genebuild version attribute + Returns : undef + Parameters : String (the genebuild version) + Throws : No exceptions + Comments : None + +=cut + +sub set_genebuild_version { + my ( $self, $arg ) = @_; + $genebuild_version{ id $self} = check_genebuild_version($arg); + return; +} + +=method check_genebuild_version + + Usage : $gv = check_genebuild_version($gv); + Purpose : Check for valid genebuild version + Returns : String (the valid genebuild version) + Parameters : String (the genebuild version) + Throws : If genebuild version is missing or invalid (i.e. not + alphanumeric) + Comments : None + +=cut + +sub check_genebuild_version { + my ($genebuild_version) = @_; + return $genebuild_version + if defined $genebuild_version && $genebuild_version =~ m/\A \w+ \z/xms; + confess 'No genebuild version specified' if !defined $genebuild_version; + confess "Invalid genebuild version ($genebuild_version) specified"; +} + +=method stable_id + + Usage : my $stable_id = $gene->stable_id; + Purpose : Getter for stable id attribute + Returns : String (e.g. 
"ENSDARG00000095747") + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub stable_id { + my ($self) = @_; + return $stable_id{ id $self}; +} + +=method set_stable_id + + Usage : $gene->set_stable_id('ENSDARG00000095747'); + Purpose : Setter for stable id attribute + Returns : undef + Parameters : String (the stable id) + Throws : No exceptions + Comments : None + +=cut + +sub set_stable_id { + my ( $self, $arg ) = @_; + $stable_id{ id $self} = check_stable_id($arg); + return; +} + +=method check_stable_id + + Usage : $stable_id = check_stable_id($stable_id); + Purpose : Check for valid stable id + Returns : String (the valid stable id) + Parameters : String (the stable id) + Throws : If stable id is missing or invalid + Comments : None + +=cut + +sub check_stable_id { + my ($stable_id) = @_; + return $stable_id + if defined $stable_id && $stable_id =~ m/\A [[:upper:]]+ \d{11} \z/xms; + confess 'No stable id specified' if !defined $stable_id; + confess "Invalid stable id ($stable_id) specified"; +} + +=method name + + Usage : my $name = $gene->name; + Purpose : Getter for name attribute + Returns : String (e.g. "cxc64") + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub name { + my ($self) = @_; + return $name{ id $self}; +} + +=method set_name + + Usage : $gene->set_name('cxc64'); + Purpose : Setter for name attribute + Returns : undef + Parameters : String (the name) + Throws : No exceptions + Comments : None + +=cut + +sub set_name { + my ( $self, $arg ) = @_; + $name{ id $self} = _check_name($arg); + return; +} + +# Usage : $name = _check_name($name); +# Purpose : Check for valid name +# Returns : String (the valid name) +# Parameters : String (the name) +# Throws : If name > $MAX_NAME_LENGTH characters +# Comments : None +sub _check_name { + my ($name) = @_; + return $name + if !defined $name + || ( length $name > 0 && length $name <= $MAX_NAME_LENGTH ); + confess 'Name is empty' if !length $name; + confess "Name ($name) longer than $MAX_NAME_LENGTH characters"; +} + +=method description + + Usage : my $description = $gene->description; + Purpose : Getter for description attribute + Returns : String (e.g. "CXC chemokine 64") + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub description { + my ($self) = @_; + return $description{ id $self}; +} + +=method set_description + + Usage : $gene->set_description('CXC chemokine 64'); + Purpose : Setter for description attribute + Returns : undef + Parameters : String (the description) + Throws : No exceptions + Comments : None + +=cut + +sub set_description { + my ( $self, $arg ) = @_; + $description{ id $self} = $arg; + return; +} + +=method biotype + + Usage : my $biotype = $gene->biotype; + Purpose : Getter for biotype attribute + Returns : String (e.g. "protein_coding") + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub biotype { + my ($self) = @_; + return $biotype{ id $self}; +} + +=method set_biotype + + Usage : $gene->set_biotype('protein_coding'); + Purpose : Setter for biotype attribute + Returns : undef + Parameters : String (the biotype) + Throws : No exceptions + Comments : None + +=cut + +sub set_biotype { + my ( $self, $arg ) = @_; + $biotype{ id $self} = check_biotype($arg); + return; +} + +=method check_biotype + + Usage : $biotype = check_biotype($biotype); + Purpose : Check for valid biotype + Returns : String (the valid biotype) + Parameters : String (the biotype) + Throws : If biotype is missing or invalid (i.e. 
not alphanumeric) + Comments : None + +=cut + +sub check_biotype { + my ($biotype) = @_; + return $biotype if defined $biotype && $biotype =~ m/\A \w+ \z/xms; + confess 'No biotype specified' if !defined $biotype; + confess "Invalid biotype ($biotype) specified"; +} + +=method seq_name + + Usage : my $seq_name = $gene->seq_name; + Purpose : Getter for sequence name attribute + Returns : String (e.g. "5") + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub seq_name { + my ($self) = @_; + return $seq_name{ id $self}; +} + +=method set_seq_name + + Usage : $gene->set_seq_name('5'); + Purpose : Setter for sequence name attribute + Returns : undef + Parameters : String (the sequence name) + Throws : No exceptions + Comments : None + +=cut + +sub set_seq_name { + my ( $self, $arg ) = @_; + $seq_name{ id $self} = check_seq_name($arg); + return; +} + +=method check_seq_name + + Usage : $seq_name = check_seq_name($seq_name); + Purpose : Check for valid sequence name + Returns : String (the valid sequence name) + Parameters : String (the sequence name) + Throws : If sequence name is missing or invalid (i.e. not alphanumeric) + Comments : None + +=cut + +sub check_seq_name { + my ($seq_name) = @_; + return $seq_name if defined $seq_name && $seq_name =~ m/\A \w+ \z/xms; + confess 'No sequence name specified' if !defined $seq_name; + confess "Invalid sequence name ($seq_name) specified"; +} + +=method start + + Usage : my $start = $gene->start; + Purpose : Getter for start attribute + Returns : +ve Int + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub start { + my ($self) = @_; + return $start{ id $self}; +} + +=method set_start + + Usage : $gene->set_start(40352744); + Purpose : Setter for start attribute + Returns : undef + Parameters : +ve Int (the start) + Throws : No exceptions + Comments : None + +=cut + +sub set_start { + my ( $self, $arg ) = @_; + $start{ id $self} = check_start($arg); + return; +} + +=method check_start + + Usage : $start = check_start($start); + Purpose : Check for valid start + Returns : +ve Int (the valid start) + Parameters : +ve Int (the start) + Throws : If start is missing or not a positive integer + Comments : None + +=cut + +sub check_start { + my ($start) = @_; + return $start if defined $start && $start =~ m/\A \d+ \z/xms; + confess 'No start specified' if !defined $start; + confess "Invalid start ($start) specified"; +} + +=method end + + Usage : my $end = $gene->end; + Purpose : Getter for end attribute + Returns : +ve Int + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub end { + my ($self) = @_; + return $end{ id $self}; +} + +=method set_end + + Usage : $gene->set_end(40352744); + Purpose : Setter for end attribute + Returns : undef + Parameters : +ve Int (the end) + Throws : No exceptions + Comments : None + +=cut + +sub set_end { + my ( $self, $arg ) = @_; + $end{ id $self} = check_end($arg); + return; +} + +=method check_end + + Usage : $end = check_end($end); + Purpose : Check for valid end + Returns : +ve Int (the valid end) + Parameters : +ve Int (the end) + Throws : If end is missing or not a positive integer + Comments : None + +=cut + +sub check_end { + my ($end) = @_; + return $end if defined $end && $end =~ m/\A \d+ \z/xms; + confess 'No end specified' if !defined $end; + confess "Invalid end ($end) specified"; +} + +=method strand + + Usage : my $strand = $gene->strand; + Purpose : Getter for strand attribute + Returns : Int (1 or -1) + Parameters : None + Throws : No 
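# A minimal sketch of linking transcripts to a gene, assuming the $gene from
# the constructor example above; the transcript values are hypothetical and
# the DETCT::Transcript fields mirror those used by DETCT::GeneFinder.
# add_transcript() weakens the stored reference, so the gene/transcript pair
# does not form a leaking circular structure.
use DETCT::Transcript;

my $transcript = DETCT::Transcript->new(
    {
        stable_id => 'ENSDART00000133571',
        biotype   => 'protein_coding',
        seq_name  => '5',
        start     => 40352744,
        end       => 40354399,
        strand    => 1,
        gene      => $gene,
    }
);
$gene->add_transcript($transcript);
my $transcript_count = scalar @{ $gene->get_all_transcripts() };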
exceptions + Comments : None + +=cut + +sub strand { + my ($self) = @_; + return $strand{ id $self}; +} + +=method set_strand + + Usage : $gene->set_strand(1); + Purpose : Setter for strand attribute + Returns : undef + Parameters : Int (the strand) + Throws : No exceptions + Comments : None + +=cut + +sub set_strand { + my ( $self, $arg ) = @_; + $strand{ id $self} = _check_strand($arg); + return; +} + +# Usage : $strand = _check_strand($strand); +# Purpose : Check for valid strand +# Returns : Int (1 or -1) (the valid strand) +# Parameters : Int (1 or -1) (the strand) +# Throws : If strand is missing or not 1 or -1 +# Comments : None +sub _check_strand { + my ($strand) = @_; + return $strand if defined $strand && $strand =~ m/\A \-? 1 \z/xms; + confess 'No strand specified' if !defined $strand; + confess "Invalid strand ($strand) specified"; +} + +=method add_transcript + + Usage : $gene->add_transcript($transcript); + Purpose : Add a transcript to a gene + Returns : undef + Parameters : DETCT::Transcript + Throws : If transcript is missing or invalid (i.e. not a + DETCT::Transcript object) + Comments : None + +=cut + +sub add_transcript { + my ( $self, $transcript ) = @_; + + confess 'No transcript specified' if !defined $transcript; + confess 'Class of transcript (', ref $transcript, ') not DETCT::Transcript' + if !$transcript->isa('DETCT::Transcript'); + + weaken($transcript); # Avoid circular references + + if ( !exists $transcript{ id $self} ) { + $transcript{ id $self} = [$transcript]; + } + else { + push @{ $transcript{ id $self} }, $transcript; + } + + return; +} + +=method get_all_transcripts + + Usage : $transcripts = $gene->get_all_transcripts(); + Purpose : Get all transcripts of a gene + Returns : Arrayref of DETCT::Transcript objects + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub get_all_transcripts { + my ($self) = @_; + + return $transcript{ id $self} || []; +} + +1; diff --git a/lib/DETCT/GeneFinder.pm b/lib/DETCT/GeneFinder.pm new file mode 100644 index 0000000..33269e4 --- /dev/null +++ b/lib/DETCT/GeneFinder.pm @@ -0,0 +1,417 @@ +## no critic (RequireUseStrict, RequireUseWarnings, RequireTidyCode) +package DETCT::GeneFinder; +## use critic + +# ABSTRACT: Object for finding genes (and transcripts) by location + +## Author : is1 +## Maintainer : is1 +## Created : 2012-11-24 +## Last commit by : $Author$ +## Last modified : $Date$ +## Revision : $Revision$ +## Repository URL : $HeadURL$ + +use warnings; +use strict; +use autodie; +use Carp; +use Try::Tiny; + +use Class::InsideOut qw( private register id ); +use DETCT::Gene; +use DETCT::Transcript; + +=head1 SYNOPSIS + + # Brief code examples + +=cut + +# Attributes: +private slice_adaptor => my %slice_adaptor; # Bio::EnsEMBL::DBSQL::SliceAdaptor +private cache => my %cache; # Hashref + +=method new + + Usage : my $gene_finder = DETCT::GeneFinder->new( { + slice_adaptor => $slice_adaptor, + } ); + Purpose : Constructor for gene finder objects + Returns : DETCT::GeneFinder + Parameters : Hashref { + slice_adaptor => Bio::EnsEMBL::DBSQL::SliceAdaptor, + } + Throws : No exceptions + Comments : None + +=cut + +sub new { + my ( $class, $arg_ref ) = @_; + my $self = register($class); + $self->set_slice_adaptor( $arg_ref->{slice_adaptor} ); + return $self; +} + +=method slice_adaptor + + Usage : my $slice_adaptor = $analysis->slice_adaptor; + Purpose : Getter for Ensembl slice adaptor attribute + Returns : Bio::EnsEMBL::DBSQL::SliceAdaptor + Parameters : None + Throws : No exceptions + Comments : 
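# A minimal sketch, assuming $analysis is a DETCT::Analysis with Ensembl
# settings configured; the gene finder is queried with a 3' end (sequence
# name, position, strand) and the coordinates here are made up.
use DETCT::GeneFinder;

my $gene_finder =
    DETCT::GeneFinder->new( { slice_adaptor => $analysis->slice_adaptor } );

my ( $genes, $distance, $nearest_end_pos ) =
    $gene_finder->get_nearest_genes( '5', 40_352_744, 1 );
foreach my $gene ( @{$genes} ) {
    printf "%s (%s) is %d bp from the 3' end\n",
        $gene->stable_id, $gene->biotype, $distance;
}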
None + +=cut + +sub slice_adaptor { + my ($self) = @_; + return $slice_adaptor{ id $self}; +} + +=method set_slice_adaptor + + Usage : $analysis->set_slice_adaptor($slice_adaptor); + Purpose : Setter for Ensembl slice adaptor attribute + Returns : undef + Parameters : Bio::EnsEMBL::DBSQL::SliceAdaptor + Throws : No exceptions + Comments : None + +=cut + +sub set_slice_adaptor { + my ( $self, $arg ) = @_; + $slice_adaptor{ id $self} = _check_slice_adaptor($arg); + return; +} + +# Usage : $slice_adaptor = _check_slice_adaptor($slice_adaptor); +# Purpose : Check for valid Ensembl slice adaptor +# Returns : Bio::EnsEMBL::DBSQL::SliceAdaptor +# Parameters : Bio::EnsEMBL::DBSQL::SliceAdaptor +# Throws : If slice adaptor is missing or invalid (i.e. not a +# Bio::EnsEMBL::DBSQL::SliceAdaptor object) +# Comments : None +sub _check_slice_adaptor { + my ($slice_adaptor) = @_; + return $slice_adaptor + if defined $slice_adaptor + && $slice_adaptor->isa('Bio::EnsEMBL::DBSQL::SliceAdaptor'); + confess 'No Ensembl slice adaptor specified' if !defined $slice_adaptor; + confess 'Class of Ensembl slice adaptor (', ref $slice_adaptor, + ') not Bio::EnsEMBL::DBSQL::SliceAdaptor'; +} + +=method get_nearest_transcripts + + Usage : $gene_finder->get_nearest_transcripts($seq_name, $pos, $strand); + Purpose : Retrieve the nearest transcripts to a 3' end + Returns : Arrayref (of DETCT::Transcript objects) + Int (distance) + Int (nearest 3' end position) + Parameters : String (the 3' end sequence name) + Int (the 3' end position) + Int (the 3' end strand) + Throws : No exceptions + Comments : Distance is positive if downstream of 3' end and negative if + upstream + +=cut + +sub get_nearest_transcripts { + my ( $self, $seq_name, $pos, $strand ) = @_; + + # Ensure cache is filled + $self->_fill_cache_from_ensembl($seq_name); + + my $nearest_distance; + my $nearest_transcripts = []; + my $nearest_end_pos; + + # Iterate over all 3' end transcript positions in relevant portion of cache + my @transcript_positions = keys %{ $cache{ id $self}->{$seq_name} }; + + # Favour upstream if get two transcripts same distance upstream and + # downstream (and strand is known) + @transcript_positions = sort { $a <=> $b } @transcript_positions; + ## no critic (ProhibitMagicNumbers) + if ( defined $strand && $strand == -1 ) { + ## use critic + @transcript_positions = reverse @transcript_positions; + } + + foreach my $transcript_position (@transcript_positions) { + my @transcripts = + @{ $cache{ id $self}->{$seq_name}->{$transcript_position} }; + + # Only consider transcripts matching strand (if specified) + if ( defined $strand ) { + @transcripts = grep { $_->strand == $strand } @transcripts; + } + next if !@transcripts; + + my $distance = $pos - $transcript_position; + ## no critic (ProhibitMagicNumbers) + if ( $transcripts[0]->strand == -1 ) { + ### use critic + $distance = -$distance; + } + + # Keep transcripts if nearer than seen before + if ( !defined $nearest_distance + || abs $distance < abs $nearest_distance ) + { + $nearest_transcripts = \@transcripts; + $nearest_distance = $distance; + $nearest_end_pos = $transcript_position; + } + } + + # Sort by stable id + @{$nearest_transcripts} = + sort { $a->stable_id cmp $b->stable_id } @{$nearest_transcripts}; + + return $nearest_transcripts, $nearest_distance, $nearest_end_pos; +} + +=method get_nearest_genes + + Usage : $gene_finder->get_nearest_genes($seq_name, $pos, $strand); + Purpose : Retrieve the nearest genes to a 3' end + Returns : Arrayref (of DETCT::Gene objects) + Int 
(distance) + Int (nearest 3' end position) + Parameters : String (the 3' end sequence name) + Int (the 3' end position) + Int (the 3' end strand) + Throws : No exceptions + Comments : Distance is positive if downstream of 3' end and negative if + upstream + +=cut + +sub get_nearest_genes { + my ( $self, $seq_name, $pos, $strand ) = @_; + + my ( $transcripts, $distance, $nearest_end_pos ) = + $self->get_nearest_transcripts( $seq_name, $pos, $strand ); + + my %tmp_cache; # Temporarily store genes by stable id + + # Get all genes corresponding to these transcripts + foreach my $transcript ( @{$transcripts} ) { + $tmp_cache{ $transcript->gene->stable_id } = $transcript->gene; + } + + my $nearest_genes = [ values %tmp_cache ]; + + # Sort by stable id + @{$nearest_genes} = + sort { $a->stable_id cmp $b->stable_id } @{$nearest_genes}; + + return $nearest_genes, $distance, $nearest_end_pos; +} + +# Usage : $self->_fill_cache_from_ensembl( $seq_name ); +# Purpose : Fill the cache from Ensembl for a particular sequence +# Returns : undef +# Parameters : String (the sequence name) +# Throws : No exceptions +# Comments : Cache is a hashref (keyed by sequence name) of hashrefs (keyed +# by 3' end position) of arrayrefs of transcripts + +sub _fill_cache_from_ensembl { + my ( $self, $seq_name ) = @_; + + # Skip if cache already filled + return if exists $cache{ id $self}->{$seq_name}; + + # Make sure default key exists (in case there are no genes) + $cache{ id $self}->{$seq_name} = {}; + + my $slice = $self->slice_adaptor->fetch_by_region( 'toplevel', $seq_name ); + + require Bio::EnsEMBL::ApiVersion; + my $genebuild_version = 'e' . Bio::EnsEMBL::ApiVersion::software_version(); + + my $ens_genes = $slice->get_all_Genes( undef, undef, 1 ); # Plus transcripts + foreach my $ens_gene ( @{$ens_genes} ) { + my $gene = DETCT::Gene->new( + { + genebuild_version => $genebuild_version, + stable_id => $ens_gene->stable_id, + name => $ens_gene->external_name, + description => $ens_gene->description, + biotype => $ens_gene->biotype, + seq_name => $seq_name, + start => $ens_gene->seq_region_start, + end => $ens_gene->seq_region_end, + strand => $ens_gene->seq_region_strand, + } + ); + + # Get 3' end position for each transcript + my $ens_transcripts = $ens_gene->get_all_Transcripts(); + foreach my $ens_transcript ( @{$ens_transcripts} ) { + my $transcript = DETCT::Transcript->new( + { + stable_id => $ens_transcript->stable_id, + name => $ens_transcript->external_name, + description => $ens_transcript->description, + biotype => $ens_transcript->biotype, + seq_name => $seq_name, + start => $ens_transcript->seq_region_start, + end => $ens_transcript->seq_region_end, + strand => $ens_transcript->seq_region_strand, + gene => $gene, + } + ); + $gene->add_transcript($transcript); + + my $pos = + $ens_transcript->seq_region_strand == 1 + ? $ens_transcript->seq_region_end + : $ens_transcript->seq_region_start; + + push @{ $cache{ id $self}->{$seq_name}->{$pos} }, $transcript; + } + } + + return; +} + +=method add_gene_annotation + + Usage : my $regions_ref + = $gene_finder->add_gene_annotation($regions_ary_ref); + Purpose : Add gene annotation to regions with 3' ends + Returns : Arrayref [ + Arrayref [ + String (region sequence name), + Int (region start), + Int (region end), + Int (region maximum read count), + Float (region log probability sum), + String (3' end sequence name) or undef, + Int (3' end position) or undef, + Int (3' end strand) or undef, + Int (3' end read count) or undef, + Arrayref [ + Int (count) + ... 
+ ], + Arrayref [ + Int (normalised count) + ... + ], + Int (p value) or undef, + Int (adjusted p value) or undef, + Arrayref [ + Int (condition fold change) or undef, + Int (log2 condition fold change) or undef, + ], + Arrayref [ + Arrayref [ + Int (group fold change) or undef, + Int (log2 group fold change) or undef, + ], + ... (groups) + ], + Hashref { + String (genebuild version) => Arrayref [ + Arrayref [ + String (gene stable id), + String (gene name) or undef, + String (gene description) or undef, + String (gene biotype), + Int (distance to 3' end), + Arrayref [ + Arrayref [ + String (transcript stable id), + String (transcript biotype), + ], + ... (transcripts) + ], + ], + ... (genes) + ], + } + ], + ... (regions) + } + Parameters : Arrayref (of regions) + Throws : If regions are missing + Comments : None + +=cut + +sub add_gene_annotation { + my ( $self, $regions ) = @_; + + confess 'No regions specified' if !defined $regions; + + my @output; + + foreach my $region ( @{$regions} ) { + + # Get details for region and 3' end + my $region_seq_name = $region->[0]; + my $region_start = $region->[1]; + my $region_end = $region->[2]; + ## no critic (ProhibitMagicNumbers) + my $three_prime_seq_name = $region->[5]; + my $three_prime_pos = $region->[6]; + my $three_prime_strand = $region->[7]; + ## use critic + + my %gene_annotation = (); + my $genes; + my $distance; + my $nearest_end_pos; + + if ( defined $three_prime_seq_name ) { + + # Find nearest genes to 3' end (taking strand into account) + ( $genes, $distance, $nearest_end_pos ) = + $self->get_nearest_genes( $three_prime_seq_name, $three_prime_pos, + $three_prime_strand ); + } + + # Add annotation if got genes + foreach my $gene ( @{$genes} ) { + my @transcripts; + foreach my $transcript ( @{ $gene->get_all_transcripts() } ) { + + # Only add those transcripts nearest to 3' end + ## no critic (ProhibitMagicNumbers) + if ( + ( + $transcript->strand == 1 + && $transcript->end == $nearest_end_pos + ) + || ( $transcript->strand == -1 + && $transcript->start == $nearest_end_pos ) + ) + { + ## use critic + push @transcripts, + [ $transcript->stable_id, $transcript->biotype, ]; + } + } + push @{ $gene_annotation{ $gene->genebuild_version } }, + [ + $gene->stable_id, $gene->name, $gene->description, + $gene->biotype, $distance, \@transcripts, + ]; + } + + push @{$region}, \%gene_annotation; + push @output, $region; + } + + return \@output; +} + +1; diff --git a/lib/DETCT/Misc/BAM.pm b/lib/DETCT/Misc/BAM.pm new file mode 100644 index 0000000..ae059a0 --- /dev/null +++ b/lib/DETCT/Misc/BAM.pm @@ -0,0 +1,1332 @@ +## no critic (RequireUseStrict, RequireUseWarnings, RequireTidyCode) +package DETCT::Misc::BAM; +## use critic + +# ABSTRACT: Miscellaneous functions for interacting with BAM files + +## Author : is1 +## Maintainer : is1 +## Created : 2012-09-20 +## Last commit by : $Author$ +## Last modified : $Date$ +## Revision : $Revision$ +## Repository URL : $HeadURL$ + +use warnings; +use strict; +use autodie; +use Carp; +use Try::Tiny; + +use Readonly; +use Bio::DB::Sam; +use List::Util qw( min ); +use Data::Compare; +use DETCT::Misc::Tag; + +use base qw( Exporter ); +our @EXPORT_OK = qw( + get_reference_sequence_lengths + get_sequence + count_tags + bin_reads + get_read_peaks + get_three_prime_ends + merge_three_prime_ends + filter_three_prime_ends + choose_three_prime_end + count_reads + merge_read_counts +); + +=head1 SYNOPSIS + + # Brief code examples + +=cut + +# Constants + +# Regexps for checking for polyA +Readonly our @POLYA_REGEXP => ( 
+ qr/\A AAA.AAA... \z/xms, + qr/\A AAA.AA.A.. \z/xms, + qr/\A AAA.A.AA.. \z/xms, + qr/\A AA.AAAA... \z/xms, + qr/\A AA.AAA.A.. \z/xms, + qr/\A AA.A.AAA.. \z/xms, + qr/\A A.AAAAA... \z/xms, + qr/\A A.AAAA.A.. \z/xms, + qr/\A A.AAA.AA.. \z/xms, + qr/\A A.AA.AAA.. \z/xms, + qr/\A A.A.AAAA.. \z/xms, + qr/\A AA.AA.AA.. \z/xms, +); + +=func get_reference_sequence_lengths + + Usage : my %length_of + = DETCT::Misc::BAM::get_reference_sequence_lengths($bam_file); + Purpose : Get length of each reference sequence from a BAM file + Returns : Hash ( + seq_region => length + ) + Parameters : String (the BAM file) + Throws : If BAM file is missing + Comments : None + +=cut + +sub get_reference_sequence_lengths { + my ($bam_file) = @_; + + confess 'No BAM file specified' if !defined $bam_file; + + my $sam = Bio::DB::Sam->new( -bam => $bam_file ); + + my %length_of; + + foreach my $seq_id ( $sam->seq_ids ) { + $length_of{$seq_id} = $sam->length($seq_id); + } + + return %length_of; +} + +=func get_sequence + + Usage : my $seq = DETCT::Misc::BAM::get_sequence( { + fasta_index => $fai, + seq_name => '1', + start => 1, + end => 1000, + strand => 1, + } ); + Purpose : Get sequence from FASTA file + Returns : String (sequence) + Parameters : Hashref { + fasta_index => Bio::DB::Sam::Fai + ref_fasta => String (the FASTA file) + seq_name => String (the sequence name) + start => Int (the sequence start) + end => Int (the sequence end) + strand => Int (the sequence strand) + } + Throws : If FASTA index and file are both missing + If sequence name is missing + If sequence start is missing + If sequence end is missing + If sequence strand is missing + Comments : None + +=cut + +sub get_sequence { + my ($arg_ref) = @_; + + confess 'No FASTA index or FASTA file specified' + if !defined $arg_ref->{fasta_index} && !defined $arg_ref->{ref_fasta}; + confess 'No sequence name specified' if !defined $arg_ref->{seq_name}; + confess 'No sequence start specified' if !defined $arg_ref->{start}; + confess 'No sequence end specified' if !defined $arg_ref->{end}; + confess 'No sequence strand specified' if !defined $arg_ref->{strand}; + + my $fai = + $arg_ref->{fasta_index} + ? 
$arg_ref->{fasta_index} + : Bio::DB::Sam::Fai->load( $arg_ref->{ref_fasta} ); + + my $query = sprintf '%s:%d-%d', $arg_ref->{seq_name}, $arg_ref->{start}, + $arg_ref->{end}; + + my $seq = uc $fai->fetch($query); + + if ( $arg_ref->{strand} == -1 ) { ## no critic (ProhibitMagicNumbers) + $seq = reverse $seq; + $seq =~ tr/ACGT/TGCA/; + } + + return $seq; +} + +=func count_tags + + Usage : my $count_ref = DETCT::Misc::BAM::count_tags( { + bam_file => $bam_file, + mismatch_threshold => 2, + seq_name => '1', + start => 1, + end => 1000, + tags => ['NNNNBGAGGC', 'NNNNBAGAAG'], + } ); + Purpose : Count tags and random bases in a BAM file + Returns : Hashref { + String (tag) => Hashref { + String (random bases) => Int (count) + } + } + Parameters : Hashref { + bam_file => String (the BAM file) + mismatch_threshold => Int (the mismatch threshold) + seq_name => String (the sequence name) + start => Int (the start) or undef + end => Int (the end) or undef + tags => Arrayref of strings (the tags) + } + Throws : If BAM file is missing + If mismatch threshold is missing + If tags are missing + Comments : None + +=cut + +sub count_tags { + my ($arg_ref) = @_; + + confess 'No BAM file specified' if !defined $arg_ref->{bam_file}; + confess 'No mismatch threshold specified' + if !defined $arg_ref->{mismatch_threshold}; + confess 'No sequence name specified' if !defined $arg_ref->{seq_name}; + confess 'No tags specified' if !defined $arg_ref->{tags}; + + my @tags = @{ $arg_ref->{tags} }; + + # Convert tags to regular expressions + my %re_for = DETCT::Misc::Tag::convert_tag_to_regexp(@tags); + + # Count random bases per tag + my %random_count_for; + foreach my $tag (@tags) { + my @random = $tag =~ m/[NRYKMSWBDHV]/xmsg; + $random_count_for{$tag} = scalar @random; + } + + my $sam = Bio::DB::Sam->new( -bam => $arg_ref->{bam_file} ); + + my %count; + + # Callback for filtering + my $callback = sub { + my ($alignment) = @_; + return if !is_read2($alignment); + return if is_duplicate($alignment); + return if $alignment->unmapped; + return + if above_mismatch_threshold( $alignment, + $arg_ref->{mismatch_threshold} ); + + # Match tag + my ($tag_in_read) = $alignment->query->name =~ m/[#] ([AGCT]+) \z/xmsg; + return if !$tag_in_read; + TAG: foreach my $tag ( sort keys %re_for ) { + my $regexps = $re_for{$tag}; + foreach my $re ( @{$regexps} ) { + if ( $tag_in_read =~ $re ) { + my $random = substr $tag_in_read, 0, + $random_count_for{$tag}; + $count{$tag}{$random}++; + last TAG; + } + } + } + + return; + }; + + # Construct region + my $region = $arg_ref->{seq_name}; + if ( exists $arg_ref->{start} ) { + $region .= q{:} . $arg_ref->{start}; + if ( exists $arg_ref->{end} ) { + $region .= q{-} . 
$arg_ref->{end}; + } + } + + $sam->fetch( $region, $callback ); + + return \%count; +} + +=func bin_reads + + Usage : my $bin_ref = DETCT::Misc::BAM::bin_reads( { + bam_file => $bam_file, + mismatch_threshold => 2, + bin_size => 100, + seq_name => '1', + tags => ['NNNNBGAGGC', 'NNNNBAGAAG'], + } ); + Purpose : Bin reads in a BAM file + Returns : Hashref { + Int (bin) => Int (count) + } + Parameters : Hashref { + bam_file => String (the BAM file) + mismatch_threshold => Int (the mismatch threshold) + bin_size => Int (the bin size) + seq_name => String (the sequence name) + tags => Arrayref of strings (the tags) + } + Throws : If BAM file is missing + If mismatch threshold is missing + If bin size is missing + If sequence name is missing + If tags are missing + Comments : None + +=cut + +sub bin_reads { + my ($arg_ref) = @_; + + confess 'No BAM file specified' if !defined $arg_ref->{bam_file}; + confess 'No mismatch threshold specified' + if !defined $arg_ref->{mismatch_threshold}; + confess 'No bin size specified' if !defined $arg_ref->{bin_size}; + confess 'No sequence name specified' if !defined $arg_ref->{seq_name}; + confess 'No tags specified' if !defined $arg_ref->{tags}; + + my @tags = @{ $arg_ref->{tags} }; + + # Convert tags to regular expressions + my %re_for = DETCT::Misc::Tag::convert_tag_to_regexp(@tags); + + my $sam = Bio::DB::Sam->new( -bam => $arg_ref->{bam_file} ); + + my %read_count_for; + + # Callback for filtering + my $callback = sub { + my ($alignment) = @_; + return if !is_read2($alignment); + return if is_duplicate($alignment); + return if $alignment->unmapped; + return + if above_mismatch_threshold( $alignment, + $arg_ref->{mismatch_threshold} ); + return if !matched_tag( $alignment, \%re_for ); + + # Read can span multiple bins + my $start_bin = int( ( $alignment->start - 1 ) / $arg_ref->{bin_size} ); + my $end_bin = int( ( $alignment->end - 1 ) / $arg_ref->{bin_size} ); + + foreach my $bin ( $start_bin .. $end_bin ) { + $read_count_for{$bin}++; + } + + return; + }; + + $sam->fetch( $arg_ref->{seq_name}, $callback ); + + return { $arg_ref->{seq_name} => \%read_count_for }; +} + +=func get_read_peaks + + Usage : my $peaks_ref = DETCT::Misc::BAM::get_read_peaks( { + bam_file => $bam_file, + mismatch_threshold => 2, + peak_buffer_width => 100, + seq_name => '1', + tags => ['NNNNBGAGGC', 'NNNNBAGAAG'], + } ); + Purpose : Get read peaks (overlapping reads) for a BAM file + Returns : Hashref { + String (sequence name) => Arrayref [ + Arrayref [ + Int (peak start), + Int (peak end), + Int (peak read count), + ], + ... 
(peaks) + ] + } + Parameters : Hashref { + bam_file => String (the BAM file) + mismatch_threshold => Int (the mismatch threshold) + peak_buffer_width => Int (the peak buffer size), + seq_name => String (the sequence name) + tags => Arrayref of strings (the tags) + } + Throws : If BAM file is missing + If mismatch threshold is missing + If peak buffer width is missing + If sequence name is missing + If tags are missing + Comments : BAM file must be sorted by coordinate + +=cut + +sub get_read_peaks { + my ($arg_ref) = @_; + + confess 'No BAM file specified' if !defined $arg_ref->{bam_file}; + confess 'No mismatch threshold specified' + if !defined $arg_ref->{mismatch_threshold}; + confess 'No peak buffer width specified' + if !defined $arg_ref->{peak_buffer_width}; + confess 'No sequence name specified' if !defined $arg_ref->{seq_name}; + confess 'No tags specified' if !defined $arg_ref->{tags}; + + my @tags = @{ $arg_ref->{tags} }; + + # Convert tags to regular expressions + my %re_for = DETCT::Misc::Tag::convert_tag_to_regexp(@tags); + + my $sam = Bio::DB::Sam->new( -bam => $arg_ref->{bam_file} ); + + # Peak variables + my @peaks; + my $current_peak_read_count; + my $current_peak_start; + my $current_peak_end; + + # Read variables + my $current_read_start; + my $current_read_end; + + # Callback for filtering + my $callback = sub { + my ($alignment) = @_; + return if !is_read2($alignment); + return if is_duplicate($alignment); + return if $alignment->unmapped; + return + if above_mismatch_threshold( $alignment, + $arg_ref->{mismatch_threshold} ); + return if !matched_tag( $alignment, \%re_for ); + + $current_read_start = $alignment->start; + $current_read_end = $alignment->end; + + # We're starting the first peak + if ( !defined $current_peak_start ) { + $current_peak_start = $current_read_start; + $current_peak_end = $current_read_end; + $current_peak_read_count = 1; + return; + } + + # Extend or finish current peak? + if ( $current_read_start - $current_peak_end < + $arg_ref->{peak_buffer_width} ) + { + # Extend current peak + $current_peak_end = $current_read_end; + $current_peak_read_count++; + } + else { + # Finish current peak + push @peaks, + [ + $current_peak_start, $current_peak_end, + $current_peak_read_count + ]; + + # Start new peak + $current_peak_start = $current_read_start; + $current_peak_end = $current_read_end; + $current_peak_read_count = 1; + } + + return; + }; + + # Identify peaks (where peaks are read 2s separated by a buffer of specific + # size) + $sam->fetch( $arg_ref->{seq_name}, $callback ); + + # Finish last peak + if ($current_peak_read_count) { + push @peaks, + [ $current_peak_start, $current_peak_end, $current_peak_read_count ]; + } + + return { $arg_ref->{seq_name} => \@peaks }; +} + +=func get_three_prime_ends + + Usage : my $three_prime_ref = DETCT::Misc::BAM::get_three_prime_ends( { + bam_file => $bam_file, + mismatch_threshold => 2, + seq_name => '1', + tags => ['NNNNBGAGGC', 'NNNNBAGAAG'], + regions => $regions_ary_ref, + } ); + Purpose : Get all 3' ends for a list of regions + Returns : Hashref { + String (sequence name) => Arrayref [ + Arrayref [ + Int (region start), + Int (region end), + Int (region maximum read count), + Float (region log probability sum), + Arrayref [ + Arrayref [ + String (3' end sequence name), + Int (3' end position), + Int (3' end strand), + Int (3' end read count), + ], + ... (3' ends) + ], + ], + ... 
(regions) + } + Parameters : Hashref { + bam_file => String (the BAM file) + mismatch_threshold => Int (the mismatch threshold) + seq_name => String (the sequence name) + tags => Arrayref of strings (the tags) + regions => Arrayref (of regions) + } + Throws : If BAM file is missing + If mismatch threshold is missing + If sequence name is missing + If tags are missing + If regions are missing + Comments : regions parameter is a list of regions, unlike the regions + parameter for merge_three_prime_ends where it is a list of lists + of regions + +=cut + +sub get_three_prime_ends { + my ($arg_ref) = @_; + + confess 'No BAM file specified' if !defined $arg_ref->{bam_file}; + confess 'No mismatch threshold specified' + if !defined $arg_ref->{mismatch_threshold}; + confess 'No sequence name specified' if !defined $arg_ref->{seq_name}; + confess 'No tags specified' if !defined $arg_ref->{tags}; + confess 'No regions specified' if !defined $arg_ref->{regions}; + + my @tags = @{ $arg_ref->{tags} }; + + # Convert tags to regular expressions + my %re_for = DETCT::Misc::Tag::convert_tag_to_regexp(@tags); + + my $sam = Bio::DB::Sam->new( -bam => $arg_ref->{bam_file} ); + + my @regions_with_three_prime_ends; + + foreach my $region ( @{ $arg_ref->{regions} } ) { + my ( $start, $end, $max_read_count, $log_prob_sum ) = @{$region}; + + my %count_for; + + # Get all second reads in region + my $read2_alignments = $sam->features( + -seq_id => $arg_ref->{seq_name}, + -start => $start, + -end => $end, + -flags => { SECOND_MATE => 1 }, + -iterator => 1, + ); + + # Get all 3' ends + while ( my $alignment = $read2_alignments->next_seq ) { + next if is_duplicate($alignment); + + # next if $alignment->unmapped; # Not needed; always mapped + next if $alignment->munmapped; # Want read 1 mapped too + next + if above_mismatch_threshold( $alignment, + $arg_ref->{mismatch_threshold} ); + next if !matched_tag( $alignment, \%re_for ); + + # Skip if 3' end is on a different chromosome + # Hopefully not significant number of real 3' ends on different + # chromosomes because are hard to deal with + # If reads are on different chromosomes then TLEN will be 0 and + # mate_end will return undefined (i.e. 
can't get 3' end position + # without querying by read name, which is slow for a BAM file + # sorted by coordinate) + next if $alignment->mate_seq_id ne $arg_ref->{seq_name}; + + # Identify 3' end position and strand based on alignment of read 1 + my $three_prime_seq = $alignment->mate_seq_id; + my $three_prime_pos; + my $three_prime_strand; + if ( $alignment->mstrand == 1 ) { + $three_prime_pos = $alignment->mate_start; + $three_prime_strand = -1; ## no critic (ProhibitMagicNumbers) + } + else { + $three_prime_pos = $alignment->mate_end; + $three_prime_strand = 1; + } + + # Count number of reads supporting each 3' end + my $three_prime = join q{:}, $three_prime_seq, $three_prime_pos, + $three_prime_strand; + $count_for{$three_prime}++; + } + + # Turn counts into an array + my @three_prime_ends; + foreach my $three_prime ( + reverse sort { $count_for{$a} <=> $count_for{$b} } + keys %count_for + ) + { + my ( $seq, $pos, $strand ) = split /:/xms, $three_prime; + push @three_prime_ends, + [ $seq, $pos, $strand, $count_for{$three_prime} ]; + } + + # Add three prime ends to regions + push @regions_with_three_prime_ends, + [ $start, $end, $max_read_count, $log_prob_sum, \@three_prime_ends, ]; + } + + return { $arg_ref->{seq_name} => \@regions_with_three_prime_ends }; +} + +=func merge_three_prime_ends + + Usage : my $three_prime_ref + = DETCT::Misc::BAM::merge_three_prime_ends( { + seq_name => '1', + regions => $regions_ary_ref, + } ); + Purpose : Merge multiple lists of regions with 3' ends + Returns : Hashref { + String (sequence name) => Arrayref [ + Arrayref [ + Int (region start), + Int (region end), + Int (region maximum read count), + Float (region log probability sum), + Arrayref [ + Arrayref [ + String (3' end sequence name), + Int (3' end position), + Int (3' end strand), + Int (3' end read count), + ], + ... (3' ends) + ], + ], + ... (regions) + } + Parameters : Hashref { + seq_name => String (the sequence name) + regions => Arrayref (of arrayrefs of regions) + } + Throws : If sequence name is missing + If regions are missing + If each list of regions doesn't have same number of regions + If regions are not in the same order or not the same in each + list + Comments : regions parameter is a list of lists of regions, unlike + the regions parameter for get_three_prime_ends where it is a + list of regions + +=cut + +sub merge_three_prime_ends { + my ($arg_ref) = @_; + + confess 'No sequence name specified' if !defined $arg_ref->{seq_name}; + confess 'No regions specified' if !defined $arg_ref->{regions}; + + my @list_of_lists_of_regions = @{ $arg_ref->{regions} }; + + # No need to merge if only one list of regions + my $num_lists = scalar @list_of_lists_of_regions; + if ( $num_lists == 1 ) { + return { $arg_ref->{seq_name} => $list_of_lists_of_regions[0] }; + } + + # Ensure each list has same number of regions as first list + my $num_regions1 = scalar @{ $list_of_lists_of_regions[0] }; + foreach my $list_index ( 1 .. $num_lists - 1 ) { + my $num_regions2 = scalar @{ $list_of_lists_of_regions[$list_index] }; + if ( $num_regions1 != $num_regions2 ) { + confess 'Number of regions does not match in all lists'; + } + } + + my @regions_with_three_prime_ends; + + # Merge all lists + foreach my $region_index ( 0 .. $num_regions1 - 1 ) { + + # Ensure region from first list is same in each list + my $region1 = $list_of_lists_of_regions[0]->[$region_index]; + my ( $start1, $end1, $max_read_count1, $log_prob_sum1 ) = @{$region1}; + foreach my $list_index ( 1 .. 
$num_lists - 1 ) { + my $region2 = + $list_of_lists_of_regions[$list_index]->[$region_index]; + my ( $start2, $end2, $max_read_count2, $log_prob_sum2 ) = + @{$region2}; + if ( $start1 != $start2 + || $end1 != $end2 + || $max_read_count1 != $max_read_count2 + || $log_prob_sum1 != $log_prob_sum2 ) + { + confess + 'Regions not in the same order or not the same in each list'; + } + } + + # Get all the 3' ends + my @unmerged_three_prime_ends; + foreach my $list_index ( 0 .. $num_lists - 1 ) { + my $list = $list_of_lists_of_regions[$list_index]; + my $region = $list->[$region_index]; + my ( undef, undef, undef, undef, $three_prime_ends ) = @{$region}; + push @unmerged_three_prime_ends, @{$three_prime_ends}; + } + + # Add up counts for identical 3' ends + my %count_for; + foreach my $three_prime_end (@unmerged_three_prime_ends) { + my ( $seq, $pos, $strand, $read_count ) = @{$three_prime_end}; + my $three_prime = join q{:}, $seq, $pos, $strand; + $count_for{$three_prime} += $read_count; + } + + # Turn counts into an array + my @three_prime_ends; + foreach my $three_prime ( + reverse sort { $count_for{$a} <=> $count_for{$b} } + keys %count_for + ) + { + my ( $seq, $pos, $strand ) = split /:/xms, $three_prime; + push @three_prime_ends, + [ $seq, $pos, $strand, $count_for{$three_prime} ]; + } + + # Add three prime ends to regions + push @regions_with_three_prime_ends, + [ + $start1, $end1, $max_read_count1, + $log_prob_sum1, \@three_prime_ends, + ]; + + $region_index++; + } + + return { $arg_ref->{seq_name} => \@regions_with_three_prime_ends }; +} + +=func filter_three_prime_ends + + Usage : my $three_prime_ref + = DETCT::Misc::BAM::filter_three_prime_ends( { + analysis => $analysis, + seq_name => '1', + regions => $regions_ary_ref, + } ); + Purpose : Filter list of regions with 3' ends + Returns : Hashref { + String (sequence name) => Arrayref [ + Arrayref [ + Int (region start), + Int (region end), + Int (region maximum read count), + Float (region log probability sum), + Arrayref [ + Arrayref [ + String (3' end sequence name), + Int (3' end position), + Int (3' end strand), + Int (3' end read count), + ], + ... (3' ends) + ], + ], + ... 
(regions) + } + Parameters : Hashref { + analysis => DETCT::Analysis + seq_name => String (the sequence name) + regions => Arrayref (of regions) + } + Throws : If analysis is missing + If sequence name is missing + If regions are missing + Comments : regions parameter is a list of regions, unlike the regions + parameter for merge_three_prime_ends where it is a list of lists + of regions + +=cut + +sub filter_three_prime_ends { + my ($arg_ref) = @_; + + confess 'No analysis specified' if !defined $arg_ref->{analysis}; + confess 'No sequence name specified' if !defined $arg_ref->{seq_name}; + confess 'No regions specified' if !defined $arg_ref->{regions}; + + my @regions_with_three_prime_ends; + + # Iterate over regions + foreach my $region ( @{ $arg_ref->{regions} } ) { + my ( $region_start, $region_end, $region_max_read_count, + $region_log_prob_sum, $unfiltered_three_prime_ends ) + = @{$region}; + + # Filter 3' ends + my @three_prime_ends; + foreach my $three_prime_end ( @{$unfiltered_three_prime_ends} ) { + my ( $seq_name, $pos, $strand, $read_count ) = @{$three_prime_end}; + + # Must be supported by more than 3 reads + next if $read_count <= 3; ## no critic (ProhibitMagicNumbers) + + # Check 10 bp downstream of 3' end for polyA + my $ten_bp_start; + my $ten_bp_end; + if ( $strand == 1 ) { + $ten_bp_start = $pos + 1; + $ten_bp_end = $pos + 10; ## no critic (ProhibitMagicNumbers) + } + else { + $ten_bp_start = $pos - 10; ## no critic (ProhibitMagicNumbers) + $ten_bp_end = $pos - 1; + } + my $ten_bp_seq = + $arg_ref->{analysis} + ->get_subsequence( $seq_name, $ten_bp_start, $ten_bp_end, + $strand ); + + # Check if 10 bp downstream is polyA + next if is_polya($ten_bp_seq); + + push @three_prime_ends, $three_prime_end; + } + + # Add three prime ends to regions + push @regions_with_three_prime_ends, + [ + $region_start, $region_end, + $region_max_read_count, $region_log_prob_sum, + \@three_prime_ends, + ]; + } + + return { $arg_ref->{seq_name} => \@regions_with_three_prime_ends }; +} + +=func choose_three_prime_end + + Usage : my $three_prime_ref + = DETCT::Misc::BAM::choose_three_prime_end( { + seq_name => '1', + regions => $regions_ary_ref, + } ); + Purpose : Filter and adjust list of regions and choose best 3' end + Returns : Hashref { + String (sequence name) => Arrayref [ + Arrayref [ + Int (region start), + Int (region end), + Int (region maximum read count), + Float (region log probability sum), + String (3' end sequence name) or undef, + Int (3' end position) or undef, + Int (3' end strand) or undef, + Int (3' end read count) or undef, + ], + ... 
(regions) + } + Parameters : Hashref { + seq_name => String (the sequence name) + regions => Arrayref (of regions) + } + Throws : If sequence name is missing + If regions are missing + Comments : regions parameter is a list of regions, unlike the regions + parameter for merge_three_prime_ends where it is a list of lists + of regions + +=cut + +sub choose_three_prime_end { + my ($arg_ref) = @_; + + confess 'No sequence name specified' if !defined $arg_ref->{seq_name}; + confess 'No regions specified' if !defined $arg_ref->{regions}; + + my @regions_with_three_prime_ends; + + # Iterate over regions + foreach my $region ( @{ $arg_ref->{regions} } ) { + my ( $region_start, $region_end, $region_max_read_count, + $region_log_prob_sum, $three_prime_ends ) + = @{$region}; + + my ( + $three_prime_seq_name, $three_prime_pos, + $three_prime_strand, $three_prime_read_count + ); + + @{$three_prime_ends} = reverse sort { + _sort_three_prime_end( $a, $b, $arg_ref->{seq_name}, $region_start, + $region_end ) + } @{$three_prime_ends}; + + # Get best 3' end (highest read count) + if ( @{$three_prime_ends} ) { + ( + $three_prime_seq_name, $three_prime_pos, + $three_prime_strand, $three_prime_read_count + ) = @{ $three_prime_ends->[0] }; + } + + # Reduce size of region if appropriate + ## no critic (ProhibitMagicNumbers) + if ( defined $three_prime_seq_name + && $three_prime_seq_name eq $arg_ref->{seq_name} ) + { + if ( $three_prime_strand == 1 + && $three_prime_pos < $region_end + && $three_prime_pos > $region_start ) + { + $region_end = $three_prime_pos; + } + elsif ($three_prime_strand == -1 + && $three_prime_pos > $region_start + && $three_prime_pos < $region_end ) + { + $region_start = $three_prime_pos; + } + } + ## use critic + + # Add three prime ends to regions + push @regions_with_three_prime_ends, + [ + $region_start, $region_end, + $region_max_read_count, $region_log_prob_sum, + $three_prime_seq_name, $three_prime_pos, + $three_prime_strand, $three_prime_read_count, + ]; + } + + return { $arg_ref->{seq_name} => \@regions_with_three_prime_ends }; +} + +# Sort by read count then distance to region +sub _sort_three_prime_end { + my ( $a, $b, $seq_name, $region_start, $region_end ) = @_; + + my $seq_name_a = $a->[0]; + my $seq_name_b = $b->[0]; + my $pos_a = $a->[1]; + my $pos_b = $b->[1]; + ## no critic (ProhibitMagicNumbers) + my $read_count_a = $a->[3]; + my $read_count_b = $b->[3]; + ## use critic + + # Get minimum distance to region + my $dist_a = min( abs $region_start - $pos_a, abs $region_end - $pos_a ); + my $dist_b = min( abs $region_start - $pos_b, abs $region_end - $pos_b ); + + # Make sure 3' end is on same chromosome as region + # (1e+100 is bigger than any chromosome can be to ensure sorting last) + if ( $seq_name_a ne $seq_name ) { + $dist_a = 1e+100; ## no critic (ProhibitMagicNumbers) + } + if ( $seq_name_b ne $seq_name ) { + $dist_b = 1e+100; ## no critic (ProhibitMagicNumbers) + } + + return $read_count_a <=> $read_count_b || $dist_b <=> $dist_a; +} + +=func count_reads + + Usage : my $count_ref = DETCT::Misc::BAM::count_reads( { + bam_file => $bam_file, + mismatch_threshold => 2, + seq_name => '1', + regions => $regions_ary_ref, + tags => ['NNNNBGAGGC', 'NNNNBAGAAG'], + } ); + Purpose : Count reads in regions of a BAM file + Returns : Hashref { + String (sequence name) => Arrayref [ + Arrayref [ + Int (region start), + Int (region end), + Int (region maximum read count), + Float (region log probability sum), + String (3' end sequence name) or undef, + Int (3' end position) or undef, 
+ Int (3' end strand) or undef, + Int (3' end read count) or undef, + Hashref { + String (tag) => Int (count) + } + ], + ... (regions) + } + Parameters : Hashref { + bam_file => String (the BAM file) + mismatch_threshold => Int (the mismatch threshold) + seq_name => String (the sequence name) or undef + regions => Arrayref (of regions) + tags => Arrayref of strings (the tags) + } + Throws : If BAM file is missing + If mismatch threshold is missing + If sequence name is missing + If regions are missing + If tags are missing + Comments : regions parameter is a list of regions, unlike the regions + parameter for merge_read_counts where it is a hash keyed by BAM + file with values being lists of regions + +=cut + +sub count_reads { + my ($arg_ref) = @_; + + confess 'No BAM file specified' if !defined $arg_ref->{bam_file}; + confess 'No mismatch threshold specified' + if !defined $arg_ref->{mismatch_threshold}; + confess 'No sequence name specified' if !defined $arg_ref->{seq_name}; + confess 'No regions specified' if !defined $arg_ref->{regions}; + confess 'No tags specified' if !defined $arg_ref->{tags}; + + my @tags = @{ $arg_ref->{tags} }; + + # Convert tags to regular expressions + my %re_for = DETCT::Misc::Tag::convert_tag_to_regexp(@tags); + + my $sam = Bio::DB::Sam->new( -bam => $arg_ref->{bam_file} ); + + my @regions_with_three_prime_ends; + + # Iterate over regions + foreach my $region ( @{ $arg_ref->{regions} } ) { + my ( + $region_start, $region_end, + $region_max_read_count, $region_log_prob_sum, + $three_prime_seq_name, $three_prime_pos, + $three_prime_strand, $three_prime_read_count + ) = @{$region}; + + my %count = map { $_ => 0 } @tags; + + # Get first read from each pair + my $read2_alignments = $sam->features( + -seq_id => $arg_ref->{seq_name}, + -start => $region_start, + -end => $region_end, + -flags => { SECOND_MATE => 1 }, + -iterator => 1, + ); + while ( my $alignment = $read2_alignments->next_seq ) { + next if is_duplicate($alignment); + + #next if $alignment->unmapped; # Not needed; always mapped + next + if above_mismatch_threshold( $alignment, + $arg_ref->{mismatch_threshold} ); + + # Match tag + my ($tag_in_read) = + $alignment->query->name =~ m/[#] ([AGCT]+) \z/xmsg; + next if !$tag_in_read; + TAG: foreach my $tag ( sort keys %re_for ) { + my $regexps = $re_for{$tag}; + foreach my $re ( @{$regexps} ) { + if ( $tag_in_read =~ $re ) { + $count{$tag}++; + last TAG; + } + } + } + } + + # Add read counts to regions + push @regions_with_three_prime_ends, + [ + $region_start, $region_end, + $region_max_read_count, $region_log_prob_sum, + $three_prime_seq_name, $three_prime_pos, + $three_prime_strand, $three_prime_read_count, + \%count, + ]; + } + + return { $arg_ref->{seq_name} => \@regions_with_three_prime_ends }; +} + +=func merge_read_counts + + Usage : my $count_ref + = DETCT::Misc::BAM::merge_read_counts( { + seq_name => '1', + regions => $regions_hash_ref, + samples => $samples_ary_ref, + } ); + Purpose : Merge multiple lists of regions with read counts + Returns : Hashref { + String (sequence name) => Arrayref [ + Arrayref [ + Int (region start), + Int (region end), + Int (region maximum read count), + Float (region log probability sum), + String (3' end sequence name) or undef, + Int (3' end position) or undef, + Int (3' end strand) or undef, + Int (3' end read count) or undef, + Arrayref [ + Int (count) + ... + ] + ], + ... 
(regions) + } + Parameters : Hashref { + seq_name => String (the sequence name) + regions => Arrayref (of arrayrefs of regions) + samples => Arrayref (of samples) + } + Throws : If sequence name is missing + If regions are missing + If samples are missing + If each list of regions doesn't have same number of regions + If regions are not in the same order or not the same in each + list + Comments : regions parameter is a hash keyed by BAM file with values being + lists of regions, unlike the regions parameter for count_reads + where it is a list of regions + +=cut + +sub merge_read_counts { + my ($arg_ref) = @_; + + confess 'No sequence name specified' if !defined $arg_ref->{seq_name}; + confess 'No regions specified' if !defined $arg_ref->{regions}; + confess 'No samples specified' if !defined $arg_ref->{samples}; + + my %hash_of_lists_of_regions = %{ $arg_ref->{regions} }; + + # Ensure each list has same number of regions + my @bam_files = keys %hash_of_lists_of_regions; + my $num_regions1 = scalar @{ $hash_of_lists_of_regions{ $bam_files[0] } }; + foreach my $list_index ( 1 .. scalar @bam_files - 1 ) { + my $num_regions2 = + scalar @{ $hash_of_lists_of_regions{ $bam_files[$list_index] } }; + if ( $num_regions1 != $num_regions2 ) { + confess 'Number of regions does not match in all lists'; + } + } + + # Get index for each sample + my %sample_index_for; + my $index = 0; + foreach my $sample ( @{ $arg_ref->{samples} } ) { + my $bam_file = $sample->bam_file; + my $tag = $sample->tag; + $sample_index_for{$bam_file}{$tag} = $index; + $index++; + } + + my @regions_with_three_prime_ends; + + # Merge all lists + foreach my $region_index ( 0 .. $num_regions1 - 1 ) { + + # Ensure regions are the same in each list + my $region1 = + $hash_of_lists_of_regions{ $bam_files[0] }->[$region_index]; + my @region1 = @{$region1}[ 0 .. 7 ]; ## no critic (ProhibitMagicNumbers) + foreach my $list_index ( 1 .. scalar @bam_files - 1 ) { + my $region2 = + $hash_of_lists_of_regions{ $bam_files[$list_index] } + ->[$region_index]; + + # Check first 8 fields of each region are identical + my @region2 = + @{$region2}[ 0 .. 
7 ]; ## no critic (ProhibitMagicNumbers) + if ( !Compare( \@region1, \@region2 ) ) { + confess + 'Regions not in the same order or not the same in each list'; + } + } + + my @read_counts; + + # Get read count for each BAM file / tag + foreach my $bam_file (@bam_files) { + my $region = $hash_of_lists_of_regions{$bam_file}->[$region_index]; + my $read_counts_ref = $region->[-1]; # Read counts are last field + foreach my $tag ( keys %{$read_counts_ref} ) { + my $read_count = $read_counts_ref->{$tag}; + if ( !exists $sample_index_for{$bam_file}{$tag} ) { + confess "Unknown BAM file ($bam_file) / tag ($tag) pair"; + } + my $sample_index = $sample_index_for{$bam_file}{$tag}; + $read_counts[$sample_index] = $read_count; + } + } + + push @regions_with_three_prime_ends, [ @region1, \@read_counts ]; + + $region_index++; + } + + return { $arg_ref->{seq_name} => \@regions_with_three_prime_ends }; +} + +=func matched_tag + + Usage : next if !matched_tag($alignment, \%re_for); + Purpose : Check if alignment doesn't match required tags + Returns : 1 or 0 + Parameters : Bio::DB::Bam::Alignment or Bio::DB::Bam::AlignWrapper + : Hashref of regular expressions + Throws : No exceptions + Comments : None + +=cut + +sub matched_tag { + my ( $alignment, $re_for ) = @_; + + my $got_match = 0; + + # Match tag + my ($tag_in_read) = $alignment->query->name =~ m/[#] ([AGCT]+) \z/xmsg; + if ($tag_in_read) { + TAG: foreach my $tag ( sort keys %{$re_for} ) { + my $regexps = $re_for->{$tag}; + foreach my $re ( @{$regexps} ) { + if ( $tag_in_read =~ $re ) { + $got_match = 1; + last TAG; + } + } + } + } + + return $got_match; +} + +=func is_read2 + + Usage : next if is_read2($alignment); + Purpose : Check if alignment is from read 2 (not read 1) + Returns : 1 or 0 + Parameters : Bio::DB::Bam::AlignWrapper + Throws : No exceptions + Comments : None + +=cut + +sub is_read2 { + my ($alignment) = @_; + + return ( $alignment->get_tag_values('FLAGS') =~ m/\bSECOND_MATE\b/xms ) + ? 1 + : 0; +} + +=func is_duplicate + + Usage : next if is_duplicate($alignment); + Purpose : Check if alignment is marked as a duplicate + Returns : 1 or 0 + Parameters : Bio::DB::Bam::AlignWrapper + Throws : No exceptions + Comments : None + +=cut + +sub is_duplicate { + my ($alignment) = @_; + + return ( $alignment->get_tag_values('FLAGS') =~ m/\bDUPLICATE\b/xms ) + ? 1 + : 0; +} + +=func above_mismatch_threshold + + Usage : next if above_mismatch_threshold($alignment, 2); + Purpose : Check if alignment has too many mismatches + Returns : 1 or 0 + Parameters : Bio::DB::Bam::Alignment or Bio::DB::Bam::AlignWrapper + : Int (mismatch threshold) + Throws : No exceptions + Comments : None + +=cut + +sub above_mismatch_threshold { + my ( $alignment, $threshold ) = @_; + + # Count soft clipped bases + my $cigar_ref = $alignment->cigar_array; + my $soft_clipped_bases = 0; + foreach my $pair_ref ( @{$cigar_ref} ) { + my ( $op, $count ) = @{$pair_ref}; + if ( $op eq q{S} ) { + $soft_clipped_bases += $count; + } + } + + # Get edit distance / number of mismatches + my $nm = $alignment->aux_get('NM'); + + # Check if above mismatch threshold + return ( $nm + $soft_clipped_bases > $threshold ) ? 
1 : 0; +} + +=func is_polya + + Usage : next if is_polya($seq); + Purpose : Check if sequence contains polyA + Returns : 1 or 0 + Parameters : String (sequence) + Throws : No exceptions + Comments : None + +=cut + +sub is_polya { + my ($seq) = @_; + + my $is_polya = 0; + + # Check for more than 3 As at start + if ( $seq =~ m/\A AAAA /xms ) { + $is_polya = 1; + } + + # Check for more than 6 As in total + if ( !$is_polya ) { + my $a = $seq =~ tr/A/A/; + if ( $a > 6 ) { ## no critic (ProhibitMagicNumbers) + $is_polya = 1; + } + } + + # Check specific patterns for polyA + if ( !$is_polya ) { + foreach my $regexp (@POLYA_REGEXP) { + if ( $seq =~ $regexp ) { + $is_polya = 1; + last; + } + } + } + + return $is_polya; +} + +1; diff --git a/lib/DETCT/Misc/Output.pm b/lib/DETCT/Misc/Output.pm new file mode 100644 index 0000000..18fb537 --- /dev/null +++ b/lib/DETCT/Misc/Output.pm @@ -0,0 +1,738 @@ +## no critic (RequireUseStrict, RequireUseWarnings, RequireTidyCode) +package DETCT::Misc::Output; +## use critic + +# ABSTRACT: Miscellaneous functions for outputting data + +## Author : is1 +## Maintainer : is1 +## Created : 2012-11-25 +## Last commit by : $Author$ +## Last modified : $Date$ +## Revision : $Revision$ +## Repository URL : $HeadURL$ + +use warnings; +use strict; +use autodie; +use Carp; +use Try::Tiny; + +use Readonly; +use File::Spec; +use File::Path qw( make_path ); +use Sort::Naturally; +use List::MoreUtils qw( uniq all ); + +use base qw( Exporter ); +our @EXPORT_OK = qw( + dump_as_table +); + +=head1 SYNOPSIS + + # Brief code examples + +=cut + +# Constants + +# Types +Readonly our $STRING => 1; +Readonly our $INT => 2; +Readonly our $FLOAT => 3; + +# Output formats +Readonly our @FORMATS => qw( csv tsv html ); + +=func dump_as_table + + Usage : DETCT::Misc::Output::dump_as_table( { + analysis => $analysis, + dir => '.', + regions => $regions_hash_ref, + } ); + Purpose : Dump regions in tabular format + Returns : undef + Parameters : Hashref { + analysis => DETCT::Analysis, + dir => String (the working directory), + regions => Arrayref (of regions), + } + Throws : If analysis is missing + If regions are missing + If directory is missing + Comments : None + +=cut + +sub dump_as_table { + my ($arg_ref) = @_; + + confess 'No analysis specified' if !defined $arg_ref->{analysis}; + confess 'No directory specified' if !defined $arg_ref->{dir}; + confess 'No regions specified' if !defined $arg_ref->{regions}; + + # Get conditions and groups + my @samples = @{ $arg_ref->{analysis}->get_all_samples() }; + my @conditions = uniq( nsort( map { $_->condition } @samples ) ); + my @groups = + grep { defined $_ } uniq( nsort( map { $_->group } @samples ) ); + + # Get regions sorted by p value then location + my $regions = sort_regions( $arg_ref->{regions} ); + + # Get genebuild version + my $genebuild_version; + foreach my $region ( @{$regions} ) { + ## no critic (ProhibitMagicNumbers) + ($genebuild_version) = ( sort keys %{ $region->[15] } )[-1]; # Highest + ## use critic + last if $genebuild_version; + } + + # Get definition for all columns (which determines formatting) + my $definition = get_definition( $arg_ref->{analysis}->ensembl_species, + $genebuild_version, \@samples, \@conditions, \@groups ); + + # Make sure working directory exists + if ( !-d $arg_ref->{dir} ) { + make_path( $arg_ref->{dir} ); + } + + # Open filehandles and begin all output files + my $fh = begin_output( $arg_ref->{dir}, $definition ); + + foreach my $region ( @{$regions} ) { + my @row; + + # Region + my $seq_name = 
$region->[0]; + my $start = $region->[1]; + my $end = $region->[2]; + push @row, [ $seq_name, [ $seq_name, $start, $end, $seq_name ] ]; + push @row, [ $start, [ $seq_name, $start, $end, $start ] ]; + push @row, [ $end, [ $seq_name, $start, $end, $end ] ]; + + # 3' end + ## no critic (ProhibitMagicNumbers) + my $tpe_seq_name = $region->[5]; + my $tpe_pos = $region->[6]; + my $tpe_strand = $region->[7]; + my $tpe_read_count = $region->[8]; + ## use critic + push @row, + [ $tpe_pos, [ $tpe_seq_name, $tpe_pos, $tpe_pos, $tpe_pos ] ]; + push @row, [$tpe_strand]; + push @row, [$tpe_read_count]; + + # p values + ## no critic (ProhibitMagicNumbers) + my $pval = $region->[11]; + my $padj = $region->[12]; + ## use critic + push @row, [$pval]; + push @row, [$padj]; + + # Gene details + ## no critic (ProhibitMagicNumbers) + my %gene = %{ $region->[15] }; + my ($genebuild) = ( sort keys %gene )[-1]; # Highest + ## use critic + my @distance; + my ( @gene_stable_id, @gene_stable_id_to_link ); + my @gene_biotype; + my ( @transcript_stable_id, @transcript_stable_id_to_link ); + my @transcript_biotype; + my ( @name, @name_to_link ); + my @description; + + if ($genebuild) { + foreach my $gene ( @{ $gene{$genebuild} } ) { + my ( $gene_stable_id, $name, $description, $gene_biotype, + $distance, $transcripts ) + = @{$gene}; + push @distance, $distance; + push @gene_stable_id, $gene_stable_id; + push @gene_stable_id_to_link, + [ $gene_stable_id, $gene_stable_id ]; + push @gene_biotype, $gene_biotype; + foreach my $transcript ( @{$transcripts} ) { + my ( $transcript_stable_id, $transcript_biotype ) = + @{$transcript}; + push @transcript_stable_id, $transcript_stable_id; + push @transcript_stable_id_to_link, + [ $transcript_stable_id, $transcript_stable_id ]; + push @transcript_biotype, $transcript_biotype; + } + push @name, $name; + push @name_to_link, [ $gene_stable_id, $name ]; + push @description, $description; + } + } + push @row, [ \@distance ]; + push @row, [ \@gene_stable_id, [@gene_stable_id_to_link] ]; + push @row, [ \@gene_biotype ]; + push @row, [ \@transcript_stable_id, [@transcript_stable_id_to_link] ]; + push @row, [ \@transcript_biotype ]; + push @row, [ \@name, [@name_to_link] ]; + push @row, [ \@description ]; + + # Counts and normalised counts + ## no critic (ProhibitMagicNumbers) + my @counts = map { [$_] } @{ $region->[9] }; + my @normalised_counts = map { [$_] } @{ $region->[10] }; + ## use critic + foreach my $count (@counts) { + push @row, [$count]; + } + foreach my $normalised_count (@normalised_counts) { + push @row, [$normalised_count]; + } + + # Condition fold changes + if ( scalar @conditions == 2 ) { + ## no critic (ProhibitMagicNumbers) + my ( $condition_fold_change, $log2_condition_fold_change ) = + @{ $region->[13] }; + ## use critic + push @row, [$log2_condition_fold_change]; + } + + # Group fold changes + if ( scalar @conditions == 2 && scalar @groups > 1 ) { + ## no critic (ProhibitMagicNumbers) + my @group_fold_changes = @{ $region->[14] }; + ## use critic + foreach my $group_fold_change (@group_fold_changes) { + my ( + $condition_group_fold_change, + $log2_condition_group_fold_change + ) = @{$group_fold_change}; + push @row, [$log2_condition_group_fold_change]; + } + } + + # Dump row in all required formats + my @levels = qw( all ); + if ( defined $padj + && $padj ne 'NA' + && $padj < $arg_ref->{analysis}->output_sig_level ) + { + push @levels, 'sig'; + } + dump_output( \@levels, $fh, $definition, \@row ); + } + + # End all output files and close filehandles + end_output($fh); 
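+ # The begin_output/dump_output/end_output calls above have written six files into the working directory: + # all.csv, all.tsv and all.html (every region) plus sig.csv, sig.tsv and sig.html (only regions whose + # adjusted p value is below the analysis output significance level)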
+ + return; +} + +=func sort_regions + + Usage : $regions = sort_regions( $regions ); + Purpose : Sort regions by p value then location + Returns : Arrayref (of regions) + Parameters : Arrayref of regions + Throws : No exceptions + Comments : None + +=cut + +sub sort_regions { + my ($regions) = @_; + + # Separate regions with no p value from rest + my @regions_with_pval; + my @regions_no_pval; + foreach my $region ( @{$regions} ) { + ## no critic (ProhibitMagicNumbers) + if ( defined $region->[11] && $region->[11] ne 'NA' ) { + ## use critic + push @regions_with_pval, $region; + } + else { + push @regions_no_pval, $region; + } + } + + # Sort by adjusted p value and p value then regions without p value + ## no critic (ProhibitMagicNumbers) + my @regions = + sort { $a->[12] <=> $b->[12] || $a->[11] <=> $b->[11] } + @regions_with_pval; + ## use critic + push @regions, @regions_no_pval; # Sorted by location already + + return \@regions; +} + +=func get_definition + + Usage : $definition = get_definition($genebuild_version, $samples, + $conditions, $groups); + Purpose : Return the definitions for all columns of the table + Returns : Arrayref (of column definitions) + Parameters : String (Ensembl species) + String (genebuild version) + Arrayref of samples + Arrayref of conditions + Arrayref of groups + Throws : No exceptions + Comments : None + +=cut + +sub get_definition { + my ( $species, $genebuild_version, $samples, $conditions, $groups ) = @_; + + # Ensembl links + my $loc_link = + $species + ? qq{%s} + : undef; + my $gene_link = + q{%s}; + + my @def; + + push @def, [ 'Chr', $STRING, $loc_link, ]; + push @def, [ 'Region start', $INT, $loc_link, ]; + push @def, [ 'Region end', $INT, $loc_link, ]; + push @def, [ q{3' end position}, $INT, $loc_link, ]; + push @def, [ q{3' end strand}, $INT, ]; + push @def, [ q{3' end read count}, $INT, ]; + push @def, [ 'p value', $FLOAT, ]; + push @def, [ 'Adjusted p value', $FLOAT, ]; + push @def, [ q{Distance to 3' end }, $INT, ]; + push @def, + [ $genebuild_version . ' Ensembl Gene ID', $STRING, $gene_link, ]; + push @def, [ 'Gene type', $STRING, ]; + push @def, + [ $genebuild_version . ' Ensembl Transcript ID', $STRING, $gene_link, ]; + push @def, [ 'Transcript type', $STRING, ]; + push @def, [ 'Gene name', $STRING, $gene_link, ]; + push @def, [ 'Gene description', $STRING, ]; + + foreach my $sample ( @{$samples} ) { + push @def, [ $sample->name . ' count', $INT ]; + } + foreach my $sample ( @{$samples} ) { + push @def, [ $sample->name . ' normalised count', $FLOAT ]; + } + + if ( scalar @{$conditions} == 2 ) { + my $heading = sprintf 'Log2 fold change (%s/%s)', $conditions->[0], + $conditions->[1]; + push @def, [ $heading, $FLOAT ]; + } + + if ( scalar @{$conditions} == 2 && scalar @{$groups} > 1 ) { + foreach my $group ( @{$groups} ) { + my $heading = sprintf 'Log2 fold change (%s/%s) for group %s', + $conditions->[0], $conditions->[1], $group; + push @def, [ $heading, $FLOAT ]; + } + } + + return \@def; +} + +=func begin_output + + Usage : my $fh = begin_output( $dir, $defintion ); + Purpose : Open filehandles and begin all output files + Returns : undef + Parameters : String (the directory) + Arrayref (the definition) + Throws : No exceptions + Comments : None + +=cut + +sub begin_output { + my ( $dir, $definition ) = @_; + + my %fh; + foreach my $format (@FORMATS) { + + # Level determines whether output all regions or just significant ones + foreach my $level (qw( all sig )) { + my $file = File::Spec->catfile( $dir, $level . q{.} . 
$format ); + open my $fh, '>', $file; ## no critic (RequireBriefOpen) + $fh{$format}{$level} = $fh; + my $begin_sub_name = 'begin_' . $format; + my $sub_ref = \&{$begin_sub_name}; + &{$sub_ref}( $fh, $definition ); + } + } + + return \%fh; +} + +=func end_output + + Usage : end_output( $fh ); + Purpose : End all output files and close filehandles + Returns : undef + Parameters : Hashref (of filehandles) + Throws : No exceptions + Comments : None + +=cut + +sub end_output { + my ($fh) = @_; + + foreach my $format (@FORMATS) { + foreach my $level (qw( all sig )) { + my $end_sub_name = 'end_' . $format; + my $sub_ref = \&{$end_sub_name}; + &{$sub_ref}( $fh->{$format}{$level} ); + close $fh->{$format}{$level}; + } + } + + return; +} + +=func dump_output + + Usage : dump_output( $levels, $fh, $definition, $row ); + Purpose : Dump row in all required formats at all required levels + Returns : undef + Parameters : Arrayref (of levels) + Hashref (of filehandles) + Arrayref (the definition) + Arrayref (of row data) + Throws : No exceptions + Comments : None + +=cut + +sub dump_output { + my ( $levels, $fh, $definition, $row ) = @_; + + foreach my $format (@FORMATS) { + foreach my $level ( @{$levels} ) { + my $dump_sub_name = 'dump_' . $format; + my $sub_ref = \&{$dump_sub_name}; + &{$sub_ref}( $fh->{$format}{$level}, $definition, $row ); + } + } + + return; +} + +=func begin_csv + + Usage : begin_csv( $fh, $defintion ); + Purpose : Begin CSV table + Returns : undef + Parameters : Filehandle + Arrayref (the definition) + Throws : No exceptions + Comments : None + +=cut + +sub begin_csv { + my ( $fh, $definition ) = @_; + + my @headings; + foreach my $column ( @{$definition} ) { + my ($heading) = @{$column}; + $heading =~ s/"/""/xmsg; + push @headings, q{"} . $heading . q{"}; + } + print {$fh} ( join q{,}, @headings ), "\r\n"; + + return; +} + +=func end_csv + + Usage : end_csv( $fh ); + Purpose : End CSV table + Returns : undef + Parameters : Filehandle + Throws : No exceptions + Comments : None + +=cut + +sub end_csv { + return; +} + +=func dump_csv + + Usage : dump_csv( $fh, $definition, $row ); + Purpose : Dump the data in a CSV table + Returns : undef + Parameters : Filehandle + Arrayref (the defintion) + Arrayref (the row data) + Throws : No exceptions + Comments : None + +=cut + +sub dump_csv { + my ( $fh, $definition, $row ) = @_; + + my @output_row; + my $i = 0; # Index to definition + foreach my $cell ( @{$row} ) { + my $type = $definition->[$i]->[1]; + my ($data) = @{$cell}; + + # Turn into a list of data, even if just one + if ( ref $data ne 'ARRAY' ) { + $data = [$data]; + } + + # Substitute default if undefined + my @output_cell; + foreach my $datum ( @{$data} ) { + $datum = defined $datum ? $datum : q{}; + push @output_cell, $datum; + } + + # Add default if necessary + if ( !@output_cell ) { + push @output_cell, q{}; + } + + my $output_cell = join q{,}, @output_cell; + + # Strings and lists need quoting + if ( $type == $STRING || scalar @output_cell > 1 ) { + $output_cell =~ s/"/""/xmsg; + $output_cell = q{"} . $output_cell . 
q{"}; + } + + push @output_row, $output_cell; + + $i++; + } + print {$fh} ( join q{,}, @output_row ), "\n"; + + return; +} + +=func begin_tsv + + Usage : begin_tsv( $fh, $defintion ); + Purpose : Begin TSV table + Returns : undef + Parameters : Filehandle + Arrayref (the definition) + Throws : No exceptions + Comments : None + +=cut + +sub begin_tsv { + my ( $fh, $definition ) = @_; + + my @headings = map { $_->[0] } @{$definition}; + print {$fh} q{#}, ( join "\t", @headings ), "\n"; + + return; +} + +=func end_tsv + + Usage : end_tsv( $fh ); + Purpose : End TSV table + Returns : undef + Parameters : Filehandle + Throws : No exceptions + Comments : None + +=cut + +sub end_tsv { + return; +} + +=func dump_tsv + + Usage : dump_tsv( $fh, $definition, $row ); + Purpose : Dump the data in a TSV table + Returns : undef + Parameters : Filehandle + Arrayref (the defintion) + Arrayref (the row data) + Throws : No exceptions + Comments : None + +=cut + +sub dump_tsv { + my ( $fh, $definition, $row ) = @_; + + my @output_row; + my $i = 0; # Index to definition + foreach my $cell ( @{$row} ) { + my $type = $definition->[$i]->[1]; + my ($data) = @{$cell}; + + # Turn into a list of data, even if just one + if ( ref $data ne 'ARRAY' ) { + $data = [$data]; + } + + # Substitute default if undefined + my @output_cell; + foreach my $datum ( @{$data} ) { + $datum = defined $datum && length $datum > 0 ? $datum : q{-}; + push @output_cell, $datum; + } + + # Add default if necessary + if ( !@output_cell ) { + push @output_cell, q{-}; + } + + push @output_row, ( join q{,}, @output_cell ); + + $i++; + } + print {$fh} ( join "\t", @output_row ), "\n"; + + return; +} + +=func begin_html + + Usage : begin_html( $fh, $defintion ); + Purpose : Begin HTML table + Returns : undef + Parameters : Filehandle + Arrayref (the definition) + Throws : No exceptions + Comments : None + +=cut + +sub begin_html { + my ( $fh, $definition ) = @_; + + print {$fh} <<'HTML'; + + + + DETCT + + + + + + +HTML + + foreach my $column ( @{$definition} ) { + my ($heading) = @{$column}; + print {$fh} '', "\n"; + } + + print {$fh} <<'HTML'; + + + +HTML + + return; +} + +=func end_html + + Usage : end_html( $fh ); + Purpose : End HTML table + Returns : undef + Parameters : Filehandle + Throws : No exceptions + Comments : None + +=cut + +sub end_html { + my ($fh) = @_; + + print {$fh} <<'HTML'; + +
', $heading, '
+ + +HTML + + return; +} + +=func dump_html + + Usage : dump_html( $fh, $definition, $row ); + Purpose : Dump the data in an HTML table + Returns : undef + Parameters : Filehandle + Arrayref (the defintion) + Arrayref (the row data) + Throws : No exceptions + Comments : None + +=cut + +sub dump_html { + my ( $fh, $definition, $row ) = @_; + + print {$fh} '', "\n"; + + my $i = 0; # Index to definition + foreach my $cell ( @{$row} ) { + my ( undef, $type, $link ) = @{ $definition->[$i] }; + my ( $data, $data_to_link ) = @{$cell}; + + # Turn into a list of data, even if just one + if ( ref $data ne 'ARRAY' ) { + $data = [$data]; + $data_to_link = [$data_to_link]; + } + + print {$fh} ''; + + my @data; + my $j = 0; # Index to each item when multiple items in one table cell + foreach my $datum ( @{$data} ) { + my $datum_to_link = $data_to_link->[$j]; + + $datum = defined $datum ? $datum : q{}; + + # Make a link if there's a link and all data for the link is defined + if ( $link && $datum_to_link && all { defined $_ } + @{$datum_to_link} ) + { + $datum = sprintf $link, @{$datum_to_link}; + } + + push @data, $datum; + + $j++; + } + + print {$fh} join '
', @data; + + print {$fh} '', "\n"; + + $i++; + } + + print {$fh} '', "\n"; + + return; +} + +1; diff --git a/lib/DETCT/Misc/PeakHMM.pm b/lib/DETCT/Misc/PeakHMM.pm new file mode 100644 index 0000000..24d3ed9 --- /dev/null +++ b/lib/DETCT/Misc/PeakHMM.pm @@ -0,0 +1,520 @@ +## no critic (RequireUseStrict, RequireUseWarnings, RequireTidyCode) +package DETCT::Misc::PeakHMM; +## use critic + +# ABSTRACT: Miscellaneous functions for running peaks HMM + +## Author : is1 +## Maintainer : is1 +## Created : 2012-10-29 +## Last commit by : $Author$ +## Last modified : $Date$ +## Revision : $Revision$ +## Repository URL : $HeadURL$ + +use warnings; +use strict; +use autodie; +use Carp; +use Try::Tiny; + +use English qw( -no_match_vars ); +use POSIX qw( WIFEXITED); +use File::Slurp; +use File::Spec; +use File::Path qw( make_path ); +use Memoize qw( memoize flush_cache ); + +use base qw( Exporter ); +our @EXPORT_OK = qw( + merge_read_peaks + summarise_read_peaks + run_peak_hmm + join_hmm_bins +); + +=head1 SYNOPSIS + + # Brief code examples + +=cut + +=func merge_read_peaks + + Usage : my $peaks_ref = DETCT::Misc::PeakHMM::merge_read_peaks( { + peak_buffer_width => 100, + seq_name => '1', + peaks => $peaks_ary_ref, + } ); + Purpose : Merge read peaks (overlapping reads) + Returns : Hashref { + String (sequence name) => Arrayref [ + Arrayref [ + Int (peak start), + Int (peak end), + Int (peak read count) + ], + ... (peaks) + ] + } + Parameters : Hashref { + peak_buffer_width => Int (the peak buffer size), + seq_name => String (the sequence name), + peaks => Arrayref (of peaks), + } + Throws : If peak buffer width is missing + If sequence name is missing + If peaks are missing + Comments : None + +=cut + +sub merge_read_peaks { + my ($arg_ref) = @_; + + confess 'No peak buffer width specified' + if !defined $arg_ref->{peak_buffer_width}; + confess 'No sequence name specified' if !defined $arg_ref->{seq_name}; + confess 'No peaks specified' if !defined $arg_ref->{peaks}; + + my $peaks_ref = $arg_ref->{peaks}; + + # Sort peaks by start then end + @{$peaks_ref} = + sort { $a->[0] <=> $b->[0] || $a->[1] <=> $b->[1] } @{$peaks_ref}; + + # Peak variables + my @merged_peaks; + my $current_merged_peak_read_count; + my $current_merged_peak_start; + my $current_merged_peak_end; + + # Merge peaks + foreach my $peak ( @{$peaks_ref} ) { + my ( $peak_start, $peak_end, $peak_read_count ) = @{$peak}; + + # We're starting the first merged peak + if ( !defined $current_merged_peak_start ) { + $current_merged_peak_start = $peak_start; + $current_merged_peak_end = $peak_end; + $current_merged_peak_read_count = $peak_read_count; + next; + } + + # Extend or finish current merged peak? 
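+ # A peak is merged into the current merged peak when the gap between
+ # its start and the end of the current merged peak is smaller than the
+ # peak buffer width (and its read count is added to the running total);
+ # otherwise the current merged peak is finished and a new merged peak
+ # is started from this peak.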
+ if ( $peak_start - $current_merged_peak_end < + $arg_ref->{peak_buffer_width} ) + { + # Extend current merged peak + $current_merged_peak_end = $peak_end; + $current_merged_peak_read_count += $peak_read_count; + } + else { + # Finish current merged peak + push @merged_peaks, + [ + $current_merged_peak_start, $current_merged_peak_end, + $current_merged_peak_read_count + ]; + + # Start new merged peak + $current_merged_peak_start = $peak_start; + $current_merged_peak_end = $peak_end; + $current_merged_peak_read_count = $peak_read_count; + } + } + + # Finish last merged peak + if ($current_merged_peak_read_count) { + push @merged_peaks, + [ + $current_merged_peak_start, $current_merged_peak_end, + $current_merged_peak_read_count + ]; + } + + return { $arg_ref->{seq_name} => \@merged_peaks }; +} + +=func summarise_read_peaks + + Usage : my $summary_ref = DETCT::Misc::PeakHMM::summarise_read_peaks( { + bin_size => 100, + peak_buffer_width => 100, + hmm_sig_level => 0.001, + seq_name => '1', + seq_bp => 10_000_000, + read_length => 54, + peaks => $peaks_ary_ref, + } ); + Purpose : Summarise read peak distribution for HMM + Returns : Hashref { + String (sequence name) => Hashref { + total_read_count_per_mb => Float, + total_sig_read_count_per_mb => Float, + total_sig_peak_width_in_mb => Float, + median_sig_peak_width => Int, + total_sig_peaks => Int, + peak_buffer_width => Int, + read_threshold => Int, + bin_size => Int, + num_bins => Int, + } + } + Parameters : Hashref { + bin_size => Int (the bin size), + peak_buffer_width => Int (the peak buffer size), + hmm_sig_level => Float (the HMM significance level), + seq_name => String (the sequence name), + seq_bp => Int (the sequence bp), + read_length => Int (the read length), + peaks => Arrayref (of peaks), + } + Throws : If bin size is missing + If peak buffer width is missing + If HMM significance level is missing + If sequence name is missing + If sequence bp is missing + if read length is missing + If peaks are missing + Comments : Source of logic is summary.pl from + http://www.sph.umich.edu/csg/qin/HPeak/ + +=cut + +sub summarise_read_peaks { + my ($arg_ref) = @_; + + confess 'No bin size specified' if !defined $arg_ref->{bin_size}; + confess 'No peak buffer width specified' + if !defined $arg_ref->{peak_buffer_width}; + confess 'No HMM significance level specified' + if !defined $arg_ref->{hmm_sig_level}; + confess 'No sequence name specified' if !defined $arg_ref->{seq_name}; + confess 'No sequence bp specified' if !defined $arg_ref->{seq_bp}; + confess 'No read length specified' if !defined $arg_ref->{read_length}; + confess 'No peaks specified' if !defined $arg_ref->{peaks}; + + my $total_peaks = scalar @{ $arg_ref->{peaks} }; + + if ( !$total_peaks ) { + + # No peaks so won't be running HMM + return { $arg_ref->{seq_name} => {} }; + } + + # Get total read count + my $total_read_count = 0; + foreach my $peak ( @{ $arg_ref->{peaks} } ) { + my ( $start, $end, $read_count ) = @{$peak}; + $total_read_count += $read_count; + } + + # Get avg reads/bp + my $avg_reads_per_bp = $total_read_count / $arg_ref->{seq_bp}; + + # Identify significant peaks + memoize('_calc_log_sum'); + my @sig_peak_widths; + my $total_sig_read_count = 0; + my $total_sig_peak_width = 0; + foreach my $peak ( @{ $arg_ref->{peaks} } ) { + my ( $start, $end, $read_count ) = @{$peak}; + my $width = $end - $start + 1; + my $avg_reads = $avg_reads_per_bp * $width; + my $log_avg_reads = log $avg_reads; + my $exp_avg_reads = exp $avg_reads; + + # Gather info for significant 
peaks + my $sum = 1; + my $i = 1; + while ( $i < $read_count ) { + $sum += exp _calc_log_sum( $i, $log_avg_reads ); + last if $sum >= $exp_avg_reads; + $i++; + } + my $prob = 1 - exp( -$avg_reads ) * $sum; + if ( $prob < $arg_ref->{hmm_sig_level} / $total_peaks ) { + push @sig_peak_widths, $width; + $total_sig_read_count += $read_count; + $total_sig_peak_width += $width; + } + + # Expire Memoize cache for each peak + flush_cache('_calc_log_sum'); + } + + # Calculate hit threshold + my $proportion_bp_in_peaks = + $total_read_count * $arg_ref->{read_length} / $arg_ref->{seq_bp}; + my $read_threshold = 0; + my $prob = 1; + my $sum = 1; + while ( $prob > $arg_ref->{hmm_sig_level} / $total_peaks ) { + $read_threshold++; + my $log_sum = 0; + foreach my $i ( 1 .. $read_threshold ) { + $log_sum += log($proportion_bp_in_peaks) - log $i; + } + $sum += exp $log_sum; + $prob = 1 - exp( -$proportion_bp_in_peaks ) * $sum; + } + + # Sort widths and get median + my $total_sig_peaks = scalar @sig_peak_widths; + @sig_peak_widths = sort { $a <=> $b } @sig_peak_widths; + my $median_sig_peak_width = $sig_peak_widths[ int( $total_sig_peaks / 2 ) ]; + + ## no critic (ProhibitMagicNumbers) + my $num_bins = int( $arg_ref->{seq_bp} / $arg_ref->{bin_size} + 0.5 ); + ## use critic + + ## no critic (ProhibitMagicNumbers) + my %summary = ( + total_read_count_per_mb => $total_read_count / 1_000_000, + total_sig_read_count_per_mb => $total_sig_read_count / 1_000_000, + total_sig_peak_width_in_mb => $total_sig_peak_width / 1_000_000, + median_sig_peak_width => $median_sig_peak_width || 0, + total_sig_peaks => $total_sig_peaks, + peak_buffer_width => $arg_ref->{peak_buffer_width}, + read_threshold => $read_threshold, + bin_size => $arg_ref->{bin_size}, + num_bins => $num_bins, + ); + ## use critic + + return { $arg_ref->{seq_name} => \%summary }; +} + +# Calculate log sum +sub _calc_log_sum { + my ( $i, $log ) = @_; + + if ( $i == 0 ) { + return 0; + } + else { + return $log - log($i) + _calc_log_sum( $i - 1, $log ); + } +} + +=func run_peak_hmm + + Usage : my $hmm_ref = DETCT::Misc::PeakHMM::run_peak_hmm( { + dir => '.', + hmm_sig_level => 0.001, + seq_name => '1', + read_bins => $read_bins_hash_ref, + summary => $summary_hash_ref, + hmm_binary => 'bin/quince_chiphmmnew', + } ); + Purpose : Run peak HMM + Returns : Hashref { + String (sequence name) => Arrayref [ + Arrayref [ + Int (bin), + Int (read count), + Float (log probability), + ], + ... (peaks) + } + Parameters : Hashref { + dir => String (the working directory), + hmm_sig_level => Float (the HMM significance level), + seq_name => String (the sequence name), + read_bins => Hashref (of read bins), + summary => Hashref (of summary), + hmm_binary => String (the HMM binary) + } + Throws : If directory is missing + If HMM significance level is missing + If sequence name is missing + If read bins are missing + If summary is missing + If HMM binary is missing + If command line can't be run + Comments : None + +=cut + +sub run_peak_hmm { + my ($arg_ref) = @_; + + confess 'No directory specified' if !defined $arg_ref->{dir}; + confess 'No HMM significance level specified' + if !defined $arg_ref->{hmm_sig_level}; + confess 'No sequence name specified' if !defined $arg_ref->{seq_name}; + confess 'No read bins specified' if !defined $arg_ref->{read_bins}; + confess 'No summary specified' if !defined $arg_ref->{summary}; + confess 'No HMM binary specified' if !defined $arg_ref->{hmm_binary}; + + if ( !scalar keys %{ $arg_ref->{summary} } ) { + + # No summary (i.e. 
no peaks), so won't run HMM + return { $arg_ref->{seq_name} => [] }; + } + + # Make sure working directory exists + if ( !-d $arg_ref->{dir} ) { + make_path( $arg_ref->{dir} ); + } + + # Sanitise sequence name for using in filenames + my $safe_seq_name = $arg_ref->{seq_name}; + $safe_seq_name =~ s/\W+//xmsg; + + # Write read bins to file + my $bin_file = + File::Spec->catfile( $arg_ref->{dir}, $safe_seq_name . '.bins' ); + open my $bin_fh, '>', $bin_file; + foreach my $bin ( sort { $a <=> $b } keys %{ $arg_ref->{read_bins} } ) { + print {$bin_fh} $bin, "\t", $arg_ref->{read_bins}->{$bin}, "\n"; + } + close $bin_fh; + + # Write summary to file + my $sum_file = + File::Spec->catfile( $arg_ref->{dir}, $safe_seq_name . '.params' ); + ## no critic (RequireBriefOpen) + open my $sum_fh, '>', $sum_file; + print {$sum_fh} $arg_ref->{summary}->{total_read_count_per_mb}, "\n"; + print {$sum_fh} $arg_ref->{summary}->{total_sig_read_count_per_mb}, "\n"; + print {$sum_fh} $arg_ref->{summary}->{total_sig_peak_width_in_mb}, "\n"; + print {$sum_fh} $arg_ref->{summary}->{median_sig_peak_width}, "\n"; + print {$sum_fh} $arg_ref->{summary}->{total_sig_peaks}, "\n"; + print {$sum_fh} $arg_ref->{summary}->{peak_buffer_width}, "\n"; + print {$sum_fh} $arg_ref->{summary}->{read_threshold}, "\n"; + print {$sum_fh} $arg_ref->{summary}->{bin_size}, "\n"; + print {$sum_fh} $arg_ref->{summary}->{num_bins}, "\n"; + close $sum_fh; + ## use critic + + my $hmm_file = + File::Spec->catfile( $arg_ref->{dir}, $safe_seq_name . '.hmm' ); + my $stdout_file = + File::Spec->catfile( $arg_ref->{dir}, $safe_seq_name . '.o' ); + my $stderr_file = + File::Spec->catfile( $arg_ref->{dir}, $safe_seq_name . '.e' ); + + my $cmd = join q{ }, $arg_ref->{hmm_binary}, $bin_file, $sum_file, + $hmm_file; + $cmd .= ' 1>' . $stdout_file; + $cmd .= ' 2>' . $stderr_file; + WIFEXITED( system $cmd) or confess "Couldn't run $cmd ($OS_ERROR)"; + + # Reformat output into array of arrayrefs + my $log_hmm_sig_level = log $arg_ref->{hmm_sig_level}; + my @hmm_output = (); + if ( -r $hmm_file ) { # Peak HMM can fail + foreach my $line ( read_file($hmm_file) ) { + chomp $line; + my ( $bin, undef, undef, $read_count, $log_prob ) = split /\t/xms, + $line; + next if $log_prob >= $log_hmm_sig_level; + push @hmm_output, [ $bin, $read_count, $log_prob ]; + } + } + + return { $arg_ref->{seq_name} => \@hmm_output }; +} + +=func join_hmm_bins + + Usage : my $regions_ref = DETCT::Misc::PeakHMM::join_hmm_bins( { + bin_size => 100, + seq_name => '1', + hmm_bins => $hmm_bins_ary_ref, + } ); + Purpose : Join reads bins output by peak HMM into regions + Returns : Hashref { + String (sequence name) => Arrayref [ + Arrayref [ + Int (region start), + Int (region end), + Int (region maximum read count), + Float (region log probability sum), + ], + ... 
(regions) + } + Parameters : Hashref { + bin_size => Int (the bin size), + seq_name => String (the sequence name), + hmm_bins => Arrayref (of HMM bins), + } + Throws : If bin size is missing + If sequence name is missing + If HMM bins are missing + Comments : None + +=cut + +sub join_hmm_bins { + my ($arg_ref) = @_; + + confess 'No bin size specified' if !defined $arg_ref->{bin_size}; + confess 'No sequence name specified' if !defined $arg_ref->{seq_name}; + confess 'No HMM bins specified' if !defined $arg_ref->{hmm_bins}; + + my @regions; + + # Region variables (where a region is a set of merged bins) + my $region_bin_start; + my $region_bin_end; + my $region_max_read_count; + my $region_log_prob_sum; + + foreach my $hmm_bin ( @{ $arg_ref->{hmm_bins} } ) { + my ( $bin, $read_count, $log_prob ) = @{$hmm_bin}; + + # We're starting the first region + if ( !defined $region_bin_start ) { + $region_bin_start = $bin; + $region_bin_end = $bin; + $region_max_read_count = $read_count; + $region_log_prob_sum = $log_prob; + next; + } + + # Extend or finish current region? + if ( $bin == $region_bin_end + 1 ) { + + # Next bin, so extend current region + $region_bin_end = $bin; + if ( $read_count > $region_max_read_count ) { + $region_max_read_count = $read_count; + } + $region_log_prob_sum += $log_prob; + } + else { + # Finish current region and convert to genomic coordinates + push @regions, + [ + $region_bin_start * $arg_ref->{bin_size} + 1, + ( $region_bin_end + 1 ) * $arg_ref->{bin_size}, + $region_max_read_count, + $region_log_prob_sum, + ]; + + # Start new region + $region_bin_start = $bin; + $region_bin_end = $bin; + $region_max_read_count = $read_count; + $region_log_prob_sum = $log_prob; + } + } + + # Finish last region + if ( defined $region_bin_start ) { + push @regions, + [ + $region_bin_start * $arg_ref->{bin_size} + 1, + ( $region_bin_end + 1 ) * $arg_ref->{bin_size}, + $region_max_read_count, + $region_log_prob_sum, + ]; + } + + return { $arg_ref->{seq_name} => \@regions }; +} + +1; diff --git a/lib/DETCT/Misc/R.pm b/lib/DETCT/Misc/R.pm new file mode 100644 index 0000000..64b5cea --- /dev/null +++ b/lib/DETCT/Misc/R.pm @@ -0,0 +1,271 @@ +## no critic (RequireUseStrict, RequireUseWarnings, RequireTidyCode) +package DETCT::Misc::R; +## use critic + +# ABSTRACT: Miscellaneous functions for running R + +## Author : is1 +## Maintainer : is1 +## Created : 2012-11-21 +## Last commit by : $Author$ +## Last modified : $Date$ +## Revision : $Revision$ +## Repository URL : $HeadURL$ + +use warnings; +use strict; +use autodie; +use Carp; +use Try::Tiny; + +use English qw( -no_match_vars ); +use POSIX qw( WIFEXITED); +use File::Slurp; +use File::Spec; +use File::Path qw( make_path ); +use Sort::Naturally; +use List::Util qw( sum ); +use List::MoreUtils qw( uniq ); + +use base qw( Exporter ); +our @EXPORT_OK = qw( + run_deseq +); + +=head1 SYNOPSIS + + # Brief code examples + +=cut + +=func run_deseq + + Usage : my $regions_ref = DETCT::Misc::R::run_deseq( { + dir => '.', + regions => $regions_hash_ref, + samples => $samples_ary_ref, + r_binary => 'R', + deseq_script => 'bin/run_deseq.R', + } ); + Purpose : Run DESeq + Returns : Arrayref [ + Arrayref [ + String (region sequence name), + Int (region start), + Int (region end), + Int (region maximum read count), + Float (region log probability sum), + String (3' end sequence name) or undef, + Int (3' end position) or undef, + Int (3' end strand) or undef, + Int (3' end read count) or undef, + Arrayref [ + Int (count) + ... 
+ ], + Arrayref [ + Int (normalised count) + ... + ], + Int (p value) or undef, + Int (adjusted p value) or undef, + Arrayref [ + Int (condition fold change) or undef, + Int (log2 condition fold change) or undef, + ], + Arrayref [ + Arrayref [ + Int (group fold change) or undef, + Int (log2 group fold change) or undef, + ], + ... (groups) + ] + ], + ... (regions) + } + Parameters : Hashref { + dir => String (the working directory), + regions => Hashref (of arrayrefs of regions), + samples => Arrayref (of samples) + r_binary => String (the R binary), + deseq_script => String (the DESeq script), + } + Throws : If directory is missing + If regions are missing + If samples are missing + If R binary is missing + If DESeq script is missing + If command line can't be run + Comments : None + +=cut + +sub run_deseq { + my ($arg_ref) = @_; + + confess 'No directory specified' if !defined $arg_ref->{dir}; + confess 'No regions specified' if !defined $arg_ref->{regions}; + confess 'No samples specified' if !defined $arg_ref->{samples}; + confess 'No R binary specified' if !defined $arg_ref->{r_binary}; + confess 'No DESeq script specified' if !defined $arg_ref->{deseq_script}; + + # Get conditions and groups + my @samples = @{ $arg_ref->{samples} }; + my @conditions = uniq( nsort( map { $_->condition } @samples ) ); + my @groups = uniq( nsort( map { $_->group } @samples ) ); + @groups = grep { defined $_ } @groups; + + # Make sure working directory exists + if ( !-d $arg_ref->{dir} ) { + make_path( $arg_ref->{dir} ); + } + + # Write regions to input file + my $input_file = File::Spec->catfile( $arg_ref->{dir}, 'input.txt' ); + my @sample_names = map { $_->name } @samples; + open my $input_fh, '>', $input_file; + print {$input_fh} ( join "\t", q{}, @sample_names ), "\n"; + foreach my $seq_name ( nsort( keys %{ $arg_ref->{regions} } ) ) { + foreach my $region ( @{ $arg_ref->{regions}->{$seq_name} } ) { + my $counts = $region->[-1]; + my $region_text = join q{:}, $seq_name, $region->[0], $region->[1]; + print {$input_fh} ( join "\t", $region_text, @{$counts} ), "\n"; + } + } + close $input_fh; + + # Write samples to input file + my $samples_file = File::Spec->catfile( $arg_ref->{dir}, 'samples.txt' ); + my $last_col_to_print = @groups > 1 ? 2 : 1; + my @header = ( q{}, 'condition', 'group' )[ 0 .. $last_col_to_print ]; + open my $samples_fh, '>', $samples_file; + print {$samples_fh} ( join "\t", @header ), "\n"; + foreach my $sample (@samples) { + my @row = + ( $sample->name, $sample->condition, $sample->group ) + [ 0 .. $last_col_to_print ]; + print {$samples_fh} ( join "\t", @row ), "\n"; + } + close $samples_fh; + + my $output_file = File::Spec->catfile( $arg_ref->{dir}, 'output.txt' ); + my $size_factors_file = + File::Spec->catfile( $arg_ref->{dir}, 'size_factors.txt' ); + my $qc_pdf_file = File::Spec->catfile( $arg_ref->{dir}, 'qc.pdf' ); + my $stdout_file = File::Spec->catfile( $arg_ref->{dir}, 'deseq.o' ); + my $stderr_file = File::Spec->catfile( $arg_ref->{dir}, 'deseq.e' ); + + my $cmd = join q{ }, $arg_ref->{r_binary}, '--slave', '--args', + $input_file, $samples_file, $output_file, $size_factors_file, + $qc_pdf_file, '<', $arg_ref->{deseq_script}; + $cmd .= ' 1>' . $stdout_file; + $cmd .= ' 2>' . 
$stderr_file; + WIFEXITED( system $cmd) or confess "Couldn't run $cmd ($OS_ERROR)"; + + # Get size factors for each sample + my @size_factors = read_file($size_factors_file); + chomp @size_factors; + + # Get output + my %pval_for; + my %padj_for; + foreach my $line ( read_file($output_file) ) { + chomp $line; + my ( $region_text, $pval, $padj ) = split /\t/xms, $line; + $pval_for{$region_text} = $pval; + $padj_for{$region_text} = $padj; + } + + # Reformat output into array of arrayrefs + my @output; + foreach my $seq_name ( nsort( keys %{ $arg_ref->{regions} } ) ) { + foreach my $region ( @{ $arg_ref->{regions}->{$seq_name} } ) { + my $region_text = join q{:}, $seq_name, $region->[0], $region->[1]; + my $counts = $region->[-1]; + + # Add sequence name to region + unshift @{$region}, $seq_name; + + # Normalise counts and store for fold change calculation + my @normalised_counts; + my %counts_for_condition; + my %counts_for_group_condition; + my $sample_index = 0; + foreach my $sample (@samples) { + my $normalised_count = + $counts->[$sample_index] / $size_factors[$sample_index]; + push @normalised_counts, $normalised_count; + push @{ $counts_for_condition{ $sample->condition } }, + $normalised_count; + push @{ $counts_for_group_condition{ $sample->group } + { $sample->condition } }, $normalised_count; + $sample_index++; + } + push @{$region}, \@normalised_counts; + + # Add p value and adjusted p value + push @{$region}, $pval_for{$region_text}, $padj_for{$region_text}; + + # Calculate fold change if two conditions + my $fold_change; + my $log2_fold_change; + if ( scalar @conditions == 2 ) { + ( $fold_change, $log2_fold_change ) = calc_fold_change( + $counts_for_condition{ $conditions[0] }, + $counts_for_condition{ $conditions[1] } + ); + } + push @{$region}, [ $fold_change, $log2_fold_change ]; + + # Calculate fold change for each group if two conditions + my @group_fold_changes; + if ( scalar @conditions == 2 && scalar @groups > 1 ) { + foreach my $group (@groups) { + my ( $group_fold_change, $group_log2_fold_change ) = + calc_fold_change( + $counts_for_group_condition{$group}{ $conditions[0] }, + $counts_for_group_condition{$group}{ $conditions[1] } + ); + push @group_fold_changes, + [ $group_fold_change, $group_log2_fold_change ]; + } + } + push @{$region}, \@group_fold_changes; + + push @output, $region; + } + } + + return \@output; +} + +=func calc_fold_change + + Usage : ($fold_change, $log2_fold_change) + = calc_fold_change(\@array1, \@array2); + Purpose : Calculate the fold change in mean value of two arrays + Returns : Int (fold change) + Int (log2 fold change) + Parameters : Arrayref + Arrayref + Throws : No exceptions + Comments : None + +=cut + +sub calc_fold_change { + my ( $array1_ref, $array2_ref ) = @_; + + my $fold_change; + my $log2_fold_change; + my $mean1 = sum( @{$array1_ref} ) / scalar @{$array1_ref}; + my $mean2 = sum( @{$array2_ref} ) / scalar @{$array2_ref}; + if ( $mean1 && $mean2 ) { + $fold_change = $mean1 / $mean2; # e.g. 
mutant / sibling + $log2_fold_change = log($fold_change) / log 2; + } + + return $fold_change, $log2_fold_change; +} + +1; diff --git a/lib/DETCT/Misc/Tag.pm b/lib/DETCT/Misc/Tag.pm new file mode 100644 index 0000000..4c507e2 --- /dev/null +++ b/lib/DETCT/Misc/Tag.pm @@ -0,0 +1,269 @@ +## no critic (RequireUseStrict, RequireUseWarnings, RequireTidyCode) +package DETCT::Misc::Tag; +## use critic + +# ABSTRACT: Miscellaneous functions for interacting with DETCT read tags + +## Author : is1 +## Maintainer : is1 +## Created : 2013-01-07 +## Last commit by : $Author$ +## Last modified : $Date$ +## Revision : $Revision$ +## Repository URL : $HeadURL$ + +use warnings; +use strict; +use autodie; +use Carp; +use Try::Tiny; + +use base qw( Exporter ); +our @EXPORT_OK = qw( + detag_trim_fastq + convert_tag_to_regexp +); + +=head1 SYNOPSIS + + # Brief code examples + +=cut + +=func detag_trim_fastq + + Usage : DETCT::Misc::Tag::detag_trim_fastq( { + fastq_read1_input => $fastq_read1_input, + fastq_read2_input => $fastq_read2_input, + fastq_output_prefix => $fastq_output_prefix, + pre_detag_trim_length => $pre_detag_trim_length, + polyt_trim_length => $polyt_trim_length, + polyt_min_length => $polyt_min_length, + read_tags => \@read_tags, + } ); + Purpose : Detag and trim FASTQ files + Returns : undef + Parameters : Hashref { + fastq_read1_input => String (read 1 FASTQ file), + fastq_read2_input => String (read 2 FASTQ file), + fastq_output_prefix => String (prefix for output FASTQs), + pre_detag_trim_length => Int (length to trim reads to), + polyt_trim_length => Int (polyT length to be trimmed), + polyt_min_length => Int (min Ts to define polyT), + read_tags => Arrayref (of read tags), + no_pair_suffix => Boolean or undef, + } + Throws : No exceptions + Comments : None + +=cut + +sub detag_trim_fastq { + my ($arg_ref) = @_; + + # Assume all tags are same length + my $tag_length = length $arg_ref->{read_tags}[0]; + + my $min_polyt = q{T} x $arg_ref->{polyt_min_length}; + my $polyt_re = qr/$min_polyt/xms; # Regexp for polyT matching + + my $pre_detag_trim_length = $arg_ref->{pre_detag_trim_length}; + my $polyt_trim_length = $arg_ref->{polyt_trim_length}; + + # Convert tags to regular expressions + my @read_tags = @{ $arg_ref->{read_tags} }; + my %re_tag_for = convert_tag_to_regexp(@read_tags); + + ## no critic (RequireBriefOpen) + open my $fh1_in, '<', $arg_ref->{fastq_read1_input}; + open my $fh2_in, '<', $arg_ref->{fastq_read2_input}; + ## use critic + my $fh_out_for = _open_output_fhs( $arg_ref->{fastq_output_prefix}, + $tag_length, @read_tags ); + + while ( my $read1_id = <$fh1_in> ) { + my $read2_id = <$fh2_in>; + my $read1_seq = <$fh1_in>; + my $read2_seq = <$fh2_in>; + my $read1_plus = <$fh1_in>; + my $read2_plus = <$fh2_in>; + my $read1_qual = <$fh1_in>; + my $read2_qual = <$fh2_in>; + + chomp $read1_id; + chomp $read2_id; + chomp $read1_seq; + chomp $read2_seq; + chomp $read1_plus; + chomp $read2_plus; + chomp $read1_qual; + chomp $read2_qual; + + # Do we need to add pair suffix to read IDs? + if ( $arg_ref->{no_pair_suffix} ) { + $read1_id .= '/1'; + $read2_id .= '/2'; + } + + # Remove /1 or /2 from read ids and then check they match + my $read1_id_no_suffix = $read1_id; + my $read2_id_no_suffix = $read2_id; + ## no critic (ProhibitMagicNumbers) + substr $read1_id_no_suffix, -2, 2, q{}; + substr $read2_id_no_suffix, -2, 2, q{}; + ## use critic + if ( $read1_id_no_suffix ne $read2_id_no_suffix ) { + confess 'Read order does not match in input ' + . 
"($read1_id_no_suffix does not match $read2_id_no_suffix)"; + } + + # Trim reads to specified length if necessary + if ( length $read1_seq > $pre_detag_trim_length ) { + $read1_seq = substr $read1_seq, 0, $pre_detag_trim_length; + $read2_seq = substr $read2_seq, 0, $pre_detag_trim_length; + $read1_qual = substr $read1_qual, 0, $pre_detag_trim_length; + $read2_qual = substr $read2_qual, 0, $pre_detag_trim_length; + } + + # Get tag and putative polyT from read 1 + my $tag_in_read = substr $read1_seq, 0, $tag_length; + my $polyt_seq = substr $read1_seq, $tag_length, $polyt_trim_length; + + # Default tag to add to id if no match + my $tag_for_id = q{X} x $tag_length; + my $tag_found = q{X} x $tag_length; + + # Make sure a tag matches and polyT is present + TAG: foreach my $tag ( sort keys %re_tag_for ) { + my $regexps = $re_tag_for{$tag}; + foreach my $re ( @{$regexps} ) { + if ( $tag_in_read =~ $re && $polyt_seq =~ $polyt_re ) { + $tag_for_id = $tag_in_read; + $tag_found = $tag; + substr $read1_seq, 0, $tag_length + $polyt_trim_length, q{}; + substr $read1_qual, 0, $tag_length + $polyt_trim_length, + q{}; + last TAG; # Skip rest if got a match + } + } + } + + # Add tag to id + $read1_id =~ s{ /1 \z}{#$tag_for_id/1}xms; + $read2_id =~ s{ /2 \z}{#$tag_for_id/2}xms; + + print { $fh_out_for->{$tag_found}->{1} } $read1_id, "\n"; + print { $fh_out_for->{$tag_found}->{1} } $read1_seq, "\n"; + print { $fh_out_for->{$tag_found}->{1} } $read1_plus, "\n"; + print { $fh_out_for->{$tag_found}->{1} } $read1_qual, "\n"; + print { $fh_out_for->{$tag_found}->{2} } $read2_id, "\n"; + print { $fh_out_for->{$tag_found}->{2} } $read2_seq, "\n"; + print { $fh_out_for->{$tag_found}->{2} } $read2_plus, "\n"; + print { $fh_out_for->{$tag_found}->{2} } $read2_qual, "\n"; + } + + close $fh1_in; + close $fh2_in; + _close_output_fhs($fh_out_for); + + return; +} + +# Usage : my $fh_out_for = _open_output_fhs( +# $fastq_output_prefix, $tag_length, @read_tags +# ); +# Purpose : Open filehandles for all output FASTQ files +# Returns : Hashref of hashref of filehandles +# Parameters : String (prefix for output FASTQs) +# Int (tag length) +# Array of strings (the tags) +# Throws : No exceptions +# Comments : None +sub _open_output_fhs { + my ( $fastq_output_prefix, $tag_length, @tags ) = @_; + + push @tags, q{X} x $tag_length; # Default tag if no match + + my %fh_for; + foreach my $tag (@tags) { + foreach my $read ( 1, 2 ) { + my $filename = join q{_}, $fastq_output_prefix, $tag, $read; + $filename .= '.fastq'; + open my $fh, '>', $filename; ## no critic (RequireBriefOpen) + $fh_for{$tag}->{$read} = $fh; + } + } + + return \%fh_for; +} + +# Usage : _close_output_fhs($fh_out_for); +# Purpose : Close filehandles for all output FASTQ files +# Returns : undef +# Parameters : Hashref of hashref of filehandles +# Throws : No exceptions +# Comments : None +sub _close_output_fhs { + my ($fh_for) = @_; + + foreach my $tag ( keys %{$fh_for} ) { + foreach my $read ( keys %{ $fh_for->{$tag} } ) { + close $fh_for->{$tag}->{$read}; + } + } + + return; +} + +=func convert_tag_to_regexp + + Usage : %re_for = convert_tag_to_regexp( 'NNNNBGAGGC', 'NNNNBAGAAG' ); + Purpose : Convert tags to regular expressions for matching + Returns : Hash ( + String (tag) => Arrayref (of Regexps) + ) + Parameters : Array of strings (the tags) + Throws : No exceptions + Comments : None + +=cut + +sub convert_tag_to_regexp { + my @tags = @_; + + my %re_for; + foreach my $tag (@tags) { + my @mismatch_tags = ($tag); # Start with tag without mismatches + + # Add tag 
with each possible mismatch + foreach my $i ( 0 .. length($tag) - 1 ) { + my $mismatch_tag = $tag; + my $base = substr $mismatch_tag, $i, 1, q{N}; # Replace with N + if ( $base ne q{N} ) { + + # Not completely random base already + push @mismatch_tags, $mismatch_tag; + } + } + + # Convert IUPAC codes to AGCT (or N) + foreach my $re (@mismatch_tags) { + $re =~ s/N/[NAGCT]/xmsg; # Random bases can be called as N + $re =~ s/B/[GCT]/xmsg; + $re =~ s/D/[AGT]/xmsg; + $re =~ s/H/[ACT]/xmsg; + $re =~ s/V/[AGC]/xmsg; + $re =~ s/R/[AG]/xmsg; + $re =~ s/Y/[CT]/xmsg; + $re =~ s/K/[GT]/xmsg; + $re =~ s/M/[AC]/xmsg; + $re =~ s/S/[GC]/xmsg; + $re =~ s/W/[AT]/xmsg; + push @{ $re_for{$tag} }, qr/\A $re \Z/xms; + } + } + + return %re_for; +} + +1; diff --git a/lib/DETCT/Pipeline.pm b/lib/DETCT/Pipeline.pm new file mode 100644 index 0000000..8f0c925 --- /dev/null +++ b/lib/DETCT/Pipeline.pm @@ -0,0 +1,1280 @@ +## no critic (RequireUseStrict, RequireUseWarnings, RequireTidyCode) +package DETCT::Pipeline; +## use critic + +# ABSTRACT: Object representing a pipeline + +## Author : is1 +## Maintainer : is1 +## Created : 2013-01-09 +## Last commit by : $Author$ +## Last modified : $Date$ +## Revision : $Revision$ +## Repository URL : $HeadURL$ + +use warnings; +use strict; +use autodie; +use Carp; +use Try::Tiny; + +use Readonly; +use Class::InsideOut qw( private register id ); +use Scalar::Util qw( refaddr ); +use English qw( -no_match_vars ); +use POSIX qw( WIFEXITED WIFSIGNALED WTERMSIG ); +use File::Slurp; +use File::Spec; +use File::Path qw( make_path ); +use Hash::Merge; +use YAML::Tiny qw( DumpFile ); +use Sys::Hostname; +use File::Basename; +use File::Find; +use DETCT; +use DETCT::Pipeline::Job; +use DETCT::Pipeline::Stage; + +=head1 SYNOPSIS + + # Brief code examples + +=cut + +# Attributes: +private scheduler => my %scheduler; # e.g. lsf +private analysis_dir => my %analysis_dir; # e.g. . +private analysis => my %analysis; # DETCT::Analysis object +private cmd_line => my %cmd_line; # e.g. run_de_pipeline.pl +private max_retries => my %max_retries; # e.g. 10 +private sleep_time => my %sleep_time; # e.g. 600 +private stage_to_run => my %stage_to_run; # DETCT::Pipeline::Stage object +private component_to_run => my %component_to_run; # e.g. 5 +private verbose => my %verbose; # e.g. 
1 +private hash_merge => my %hash_merge; # Hash::Merge object +private stage => my %stage; # arrayref of stages + +# Constants +Readonly our %EXTENSION_TO_KEEP => map { $_ => 1 } qw( + csv html pdf tsv txt +); + +=method new + + Usage : my $pipeline = DETCT::Pipeline->new( { + scheduler => 'lsf', + analysis_dir => '.', + analysis => $analysis, + cmd_line => 'run_de_pipeline.pl', + max_retries => 10, + sleep_time => 600, + verbose => 1, + } ); + Purpose : Constructor for pipeline objects + Returns : DETCT::Pipeline + Parameters : Hashref { + scheduler => String, + analysis_dir => String, + analysis => DETCT::Analysis, + cmd_line => String, + max_retries => Int, + sleep_time => Int, + verbose => Boolean or undef + } + Throws : No exceptions + Comments : None + +=cut + +sub new { + my ( $class, $arg_ref ) = @_; + my $self = register($class); + $self->set_scheduler( $arg_ref->{scheduler} ); + $self->set_analysis_dir( $arg_ref->{analysis_dir} ); + $self->set_analysis( $arg_ref->{analysis} ); + $self->set_cmd_line( $arg_ref->{cmd_line} ); + $self->set_max_retries( $arg_ref->{max_retries} ); + $self->set_sleep_time( $arg_ref->{sleep_time} ); + $self->set_verbose( $arg_ref->{verbose} ); + return $self; +} + +=method scheduler + + Usage : my $scheduler = $pipeline->scheduler; + Purpose : Getter for scheduler attribute + Returns : String (e.g. "lsf") + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub scheduler { + my ($self) = @_; + return $scheduler{ id $self}; +} + +=method set_scheduler + + Usage : $pipeline->set_scheduler('lsf'); + Purpose : Setter for scheduler attribute + Returns : undef + Parameters : String (the scheduler) + Throws : No exceptions + Comments : None + +=cut + +sub set_scheduler { + my ( $self, $arg ) = @_; + $scheduler{ id $self} = _check_scheduler($arg); + return; +} + +# Usage : $scheduler = _check_scheduler($scheduler); +# Purpose : Check for valid scheduler +# Returns : String (the valid scheduler) +# Parameters : String (the scheduler) +# Throws : If scheduler is not lsf or local +# Comments : None +sub _check_scheduler { + my ($scheduler) = @_; + + confess 'Invalid scheduler specified' + if !defined $scheduler + || ( $scheduler ne 'lsf' && $scheduler ne 'local' ); + + return $scheduler; +} + +=method analysis_dir + + Usage : my $analysis_dir = $pipeline->analysis_dir; + Purpose : Getter for analysis directory attribute + Returns : String (e.g. 
".") + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub analysis_dir { + my ($self) = @_; + return $analysis_dir{ id $self}; +} + +=method set_analysis_dir + + Usage : $pipeline->set_analysis_dir('.'); + Purpose : Setter for analysis directory attribute + Returns : undef + Parameters : String (the analysis directory) + Throws : No exceptions + Comments : None + +=cut + +sub set_analysis_dir { + my ( $self, $arg ) = @_; + $analysis_dir{ id $self} = _check_analysis_dir($arg); + return; +} + +# Usage : $analysis_dir = _check_analysis_dir($analysis_dir); +# Purpose : Check for valid analysis directory +# Returns : String (the valid analysis directory) +# Parameters : String (the analysis directory) +# Throws : If analysis directory is missing or invalid +# Comments : None +sub _check_analysis_dir { + my ($analysis_dir) = @_; + + # Make sure analysis directory exists + if ( defined $analysis_dir && !-d $analysis_dir ) { + make_path($analysis_dir); + } + + return $analysis_dir if defined $analysis_dir && -d $analysis_dir; + confess 'No analysis_dir specified' if !defined $analysis_dir; + confess "Invalid analysis_dir ($analysis_dir) specified"; +} + +=method analysis + + Usage : my $analysis = $pipeline->analysis; + Purpose : Getter for analysis attribute + Returns : DETCT::Analysis + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub analysis { + my ($self) = @_; + return $analysis{ id $self}; +} + +=method set_analysis + + Usage : $pipeline->set_analysis($analysis); + Purpose : Setter for analysis attribute + Returns : undef + Parameters : DETCT::Analysis + Throws : No exceptions + Comments : None + +=cut + +sub set_analysis { + my ( $self, $arg ) = @_; + $analysis{ id $self} = _check_analysis($arg); + return; +} + +# Usage : $analysis = _check_analysis($analysis); +# Purpose : Check for valid analysis object +# Returns : DETCT::Analysis +# Parameters : DETCT::Analysis +# Throws : If analysis object is missing or invalid (i.e. not a +# DETCT::Analysis object) +# Comments : None +sub _check_analysis { + my ($analysis) = @_; + return $analysis if defined $analysis && $analysis->isa('DETCT::Analysis'); + confess 'No analysis specified' if !defined $analysis; + confess 'Class of analysis (', ref $analysis, ') not DETCT::Analysis'; +} + +=method cmd_line + + Usage : my $cmd_line = $pipeline->cmd_line; + Purpose : Getter for command line attribute + Returns : String (e.g. 
"run_de_pipeline.pl") + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub cmd_line { + my ($self) = @_; + return $cmd_line{ id $self}; +} + +=method set_cmd_line + + Usage : $pipeline->set_cmd_line('run_de_pipeline.pl'); + Purpose : Setter for command line attribute + Returns : undef + Parameters : String (the command line) + Throws : No exceptions + Comments : None + +=cut + +sub set_cmd_line { + my ( $self, $arg ) = @_; + $cmd_line{ id $self} = _check_cmd_line($arg); + return; +} + +# Usage : $cmd_line = _check_cmd_line($cmd_line); +# Purpose : Check for valid command line +# Returns : String (the valid command line) +# Parameters : String (the command line) +# Throws : If command line is missing +# Comments : None +sub _check_cmd_line { + my ($cmd_line) = @_; + + confess 'No command line specified' if !defined $cmd_line || !$cmd_line; + + return $cmd_line; +} + +=method max_retries + + Usage : my $max_retries = $pipeline->max_retries; + Purpose : Getter for max retries attribute + Returns : +ve Int + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub max_retries { + my ($self) = @_; + return $max_retries{ id $self}; +} + +=method set_max_retries + + Usage : $pipeline->set_max_retries(10); + Purpose : Setter for max retries attribute + Returns : undef + Parameters : +ve Int (the max retries) + Throws : No exceptions + Comments : None + +=cut + +sub set_max_retries { + my ( $self, $arg ) = @_; + $max_retries{ id $self} = _check_max_retries($arg); + return; +} + +# Usage : $max_retries = _check_max_retries($max_retries); +# Purpose : Check for valid max retries +# Returns : +ve Int (the valid max retries) +# Parameters : +ve Int (the max retries) +# Throws : If max retries is missing or not a positive integer +# Comments : None +sub _check_max_retries { + my ($max_retries) = @_; + return $max_retries + if defined $max_retries && $max_retries =~ m/\A \d+ \z/xms; + confess 'No max retries specified' if !defined $max_retries; + confess "Invalid max retries ($max_retries) specified"; +} + +=method sleep_time + + Usage : my $sleep_time = $pipeline->sleep_time; + Purpose : Getter for sleep time attribute + Returns : +ve Int + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub sleep_time { + my ($self) = @_; + return $sleep_time{ id $self}; +} + +=method set_sleep_time + + Usage : $pipeline->set_sleep_time(600); + Purpose : Setter for sleep time attribute + Returns : undef + Parameters : +ve Int (the sleep time) + Throws : No exceptions + Comments : None + +=cut + +sub set_sleep_time { + my ( $self, $arg ) = @_; + $sleep_time{ id $self} = _check_sleep_time($arg); + return; +} + +# Usage : $sleep_time = _check_sleep_time($sleep_time); +# Purpose : Check for valid sleep time +# Returns : +ve Int (the valid sleep time) +# Parameters : +ve Int (the sleep time) +# Throws : If sleep time is missing or not a positive integer +# Comments : None +sub _check_sleep_time { + my ($sleep_time) = @_; + return $sleep_time + if defined $sleep_time && $sleep_time =~ m/\A \d+ \z/xms; + confess 'No sleep time specified' if !defined $sleep_time; + confess "Invalid sleep time ($sleep_time) specified"; +} + +=method stage_to_run + + Usage : my $stage = $pipeline->stage_to_run; + Purpose : Getter for stage to be run attribute + Returns : DETCT::Pipeline::Stage + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub stage_to_run { + my ($self) = @_; + return $stage_to_run{ id $self}; +} + +=method set_stage_to_run + + Usage 
: $pipeline->set_stage_to_run($stage); + Purpose : Setter for stage to be run attribute + Returns : undef + Parameters : DETCT::Pipeline::Stage + Throws : No exceptions + Comments : None + +=cut + +sub set_stage_to_run { + my ( $self, $arg ) = @_; + $stage_to_run{ id $self} = _check_stage_to_run($arg); + return; +} + +# Usage : $stage = _check_stage_to_run($stage); +# Purpose : Check for valid stage to be run object +# Returns : DETCT::Pipeline::Stage +# Parameters : DETCT::Pipeline::Stage +# Throws : If stage to be run object is missing or invalid (i.e. not a +# DETCT::Pipeline::Stage object) +# Comments : None +sub _check_stage_to_run { + my ($stage_to_run) = @_; + return $stage_to_run + if defined $stage_to_run && $stage_to_run->isa('DETCT::Pipeline::Stage'); + confess 'No stage to be run specified' if !defined $stage_to_run; + confess 'Class of stage to be run (', ref $stage_to_run, + ') not DETCT::Pipeline::Stage'; +} + +=method component_to_run + + Usage : my $component = $pipeline->component_to_run; + Purpose : Getter for component to be run attribute + Returns : +ve Int + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub component_to_run { + my ($self) = @_; + return $component_to_run{ id $self}; +} + +=method set_component_to_run + + Usage : $pipeline->set_component_to_run(5); + Purpose : Setter for component to be run attribute + Returns : undef + Parameters : +ve Int (the component to be run) + Throws : No exceptions + Comments : None + +=cut + +sub set_component_to_run { + my ( $self, $arg ) = @_; + $component_to_run{ id $self} = _check_component_to_run($arg); + return; +} + +# Usage : $component = _check_component_to_run($component); +# Purpose : Check for valid component to be run +# Returns : +ve Int (the valid component to be run) +# Parameters : +ve Int (the component to be run) +# Throws : If component to be run is missing or not a positive integer +# Comments : None +sub _check_component_to_run { + my ($component_to_run) = @_; + return $component_to_run + if defined $component_to_run && $component_to_run =~ m/\A \d+ \z/xms; + confess 'No component to be run specified' if !defined $component_to_run; + confess "Invalid component to be run ($component_to_run) specified"; +} + +=method verbose + + Usage : my $verbose = $pipeline->verbose; + Purpose : Getter for verbose flag + Returns : Boolean + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub verbose { + my ($self) = @_; + return $verbose{ id $self} || 0; +} + +=method set_verbose + + Usage : $pipeline->set_verbose(1); + Purpose : Setter for verbose flag + Returns : undef + Parameters : Boolean + Throws : No exceptions + Comments : None + +=cut + +sub set_verbose { + my ( $self, $arg ) = @_; + $verbose{ id $self} = $arg ? 
1 : 0; + return; +} + +=method hash_merge + + Usage : %chunk_hmm + = %{ $pipeline->hash_merge->merge(\%chunk_hmm, $seq_hmm) }; + Purpose : Return a Hash::Merge object for merging job output + Returns : Hash::Merge + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub hash_merge { + my ($self) = @_; + + if ( !exists $hash_merge{ id $self} ) { + ## no critic (ProtectPrivateSubs) + Hash::Merge::specify_behavior( + { + SCALAR => { + SCALAR => sub { $_[0] + $_[1] }, # Add scalars + ARRAY => sub { undef }, + HASH => sub { undef }, + }, + ARRAY => { + SCALAR => sub { undef }, + ARRAY => sub { [ @{ $_[0] }, @{ $_[1] } ] }, # Join arrays + HASH => sub { undef }, + }, + HASH => { + SCALAR => sub { undef }, + ARRAY => sub { undef }, + HASH => sub { Hash::Merge::_merge_hashes( $_[0], $_[1] ) }, + }, + }, + 'detct', + ); + ## use critic + $hash_merge{ id $self} = Hash::Merge->new('detct'); + } + + return $hash_merge{ id $self}; +} + +=method add_stages_from_yaml + + Usage : $pipeline->add_stages_from_yaml( 'detct.yaml' ); + Purpose : Add stages from a YAML file + Returns : undef + Parameters : String (the YAML file) + Throws : If YAML file is missing or not readable or invalid + Comments : None + +=cut + +sub add_stages_from_yaml { + my ( $self, $yaml_file ) = @_; + + confess "YAML file ($yaml_file) does not exist or cannot be read" + if !-r $yaml_file; + + my $yaml = YAML::Tiny->read($yaml_file); + + if ( !$yaml ) { + confess sprintf 'YAML file (%s) is invalid: %s', $yaml_file, + YAML::Tiny->errstr; + } + + my %tmp_cache; # Temporarily store stages by name + + foreach my $stage_hash ( @{ $yaml->[0] } ) { + my $stage = DETCT::Pipeline::Stage->new( + { + name => $stage_hash->{name}, + default_memory => $stage_hash->{default_memory}, + } + ); + foreach my $prerequisite_name ( @{ $stage_hash->{prerequisites} } ) { + $stage->add_prerequisite( $tmp_cache{$prerequisite_name} ); + } + $self->add_stage($stage); + + $tmp_cache{ $stage_hash->{name} } = $stage; + } + + return; +} + +=method add_stage + + Usage : $pipeline->add_stage($stage); + Purpose : Add a stage to a pipeline + Returns : undef + Parameters : DETCT::Pipeline::Stage + Throws : If stage is missing or invalid (i.e. 
not a + DETCT::Pipeline::Stage object) + Comments : None + +=cut + +sub add_stage { + my ( $self, $stage ) = @_; + + confess 'No stage specified' if !defined $stage; + confess 'Class of stage (', ref $stage, ') not DETCT::Pipeline::Stage' + if !$stage->isa('DETCT::Pipeline::Stage'); + + if ( !exists $stage{ id $self} ) { + $stage{ id $self} = [$stage]; + } + else { + push @{ $stage{ id $self} }, $stage; + } + + return; +} + +=method get_all_stages + + Usage : $stages = $pipeline->get_all_stages(); + Purpose : Get all stages of a pipeline + Returns : Arrayref of DETCT::Pipeline::Stage objects + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub get_all_stages { + my ($self) = @_; + + return $stage{ id $self} || []; +} + +=method get_stage_by_name + + Usage : $stage = $pipeline->get_stage_by_name('run_deseq'); + Purpose : Get a named stage of a pipeline + Returns : DETCT::Pipeline::Stage + Parameters : String (the stage name) + Throws : If stage with specified name does not exist + Comments : None + +=cut + +sub get_stage_by_name { + my ( $self, $name ) = @_; + + foreach my $stage ( @{ $stage{ id $self} } ) { + return $stage if $stage->name eq $name; + } + + confess "Invalid stage name ($name)"; +} + +=method run + + Usage : $pipeline->run(); + Purpose : Run pipeline + Returns : undef + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub run { + my ($self) = @_; + + $self->init_run(); + + my $all_stages_run = 0; + + while ( !$all_stages_run ) { + $all_stages_run = 1; + + my $jobs_running_or_run = 0; + + # Iterate over all stages of pipeline + STAGE: foreach my $stage ( @{ $self->get_all_stages() } ) { + + # Check prerequisites have already run and skip this stage if not + foreach my $prereq_stage ( @{ $stage->get_all_prerequisites() } ) { + if ( !$prereq_stage->all_jobs_run ) { + $self->say_if_verbose( + sprintf 'Skipping %s because %s not run', + $stage->name, $prereq_stage->name ); + next STAGE; + } + } + + # Create directory for current stage of analysis + my $dir = $self->get_and_create_stage_dir($stage); + + # Assume all jobs have run OK until we know otherwise + $stage->set_all_jobs_run(1); + + # All jobs marked as having run OK? + my $done_marker_file = $dir . 
'.done'; + if ( -e $done_marker_file ) { + $self->say_if_verbose( sprintf 'Stage %s has finished', + $stage->name ); + next STAGE; + } + + # Running a specific stage, but not this one + if ( $self->stage_to_run + && refaddr( $self->stage_to_run ) != refaddr($stage) ) + { + next STAGE; + } + + # Get all parameters for all components of current stage + my @all_parameters = $self->all_parameters($stage); + + $self->say_if_verbose( sprintf 'Stage %s has %d components', + $stage->name, scalar @all_parameters ); + + my $component = 0; # Index for current component of current stage + foreach my $parameters (@all_parameters) { + $component++; + + # Running a specific component, but not this one + if ( + $self->stage_to_run + && $self->component_to_run + && ( refaddr( $self->stage_to_run ) != refaddr($stage) + || $self->component_to_run != $component ) + ) + { + next; + } + + my $job = DETCT::Pipeline::Job->new( + { + stage => $stage, + component => $component, + scheduler => $self->scheduler, + base_filename => + File::Spec->catfile( $dir, $component ), + parameters => $parameters, + } + ); + + # Run job if running a specific component of a specific stage + if ( $self->stage_to_run && $self->component_to_run ) { + $self->run_job($job); + return; + } + + $jobs_running_or_run += $self->process_job($job); + } + + if ( $stage->all_jobs_run ) { + write_file( $done_marker_file, '1' ); + } + else { + $all_stages_run = 0; + } + } + + if ( !$all_stages_run && !$jobs_running_or_run ) { + $self->_delete_lock(); + die 'Stopping pipeline - no jobs to run' . "\n"; + } + + if ( !$all_stages_run ) { + $self->say_if_verbose( sprintf 'Sleeping for %d seconds', + $self->sleep_time ); + sleep $self->sleep_time; + } + } + + print 'Pipeline finished - all jobs run' . "\n"; + + $self->clean_up(); + + $self->_delete_lock(); + + return; +} + +=method init_run + + Usage : $self->init_run(); + Purpose : Initialise a pipeline run + Returns : undef + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub init_run { + my ($self) = @_; + + if ( !$self->stage_to_run && !$self->component_to_run ) { + ## no critic (RequireLocalizedPunctuationVars) + $SIG{INT} = sub { + $self->_delete_lock(); + die "\n" . 'Interrupted' . "\n"; + }; + ## use critic + $self->_create_lock(); + } + + return; +} + +=method get_and_create_stage_dir + + Usage : my $dir = $pipeline->get_and_create_stage_dir( $stage ); + Purpose : Get (and create if necessary) a directory for the current stage + Returns : String (the directory) + Parameters : DETCT::Pipeline::Stage + Throws : No exceptions + Comments : None + +=cut + +sub get_and_create_stage_dir { + my ( $self, $stage ) = @_; + + my $stage_dir = File::Spec->catdir( $self->analysis_dir, $stage->name ); + if ( !-d $stage_dir ) { + make_path($stage_dir); + } + + return $stage_dir; +} + +=method get_and_check_output_file + + Usage : my $file = $pipeline->get_and_check_output_file('run_deseq', 1); + Purpose : Get an output file for a particular component of a stage + Returns : String (the file) + Parameters : String (the stage) + Int (the component) + Throws : If output file doesn't exist + Comments : None + +=cut + +sub get_and_check_output_file { + my ( $self, $stage_name, $component ) = @_; + + my $output_file = File::Spec->catfile( $self->analysis_dir, $stage_name, + $component . 
'.out' ); + if ( !-e $output_file ) { + confess "$output_file doesn't exist, but should"; + } + + return $output_file; +} + +=method process_job + + Usage : $jobs_running_or_run += $pipeline->process_job($job); + Purpose : Process a job to see if needs to be submitted + Returns : Boolean + Parameters : DETCT::Pipeline::Job + Throws : No exceptions + Comments : Returns whether or not job has been run or was already running + +=cut + +sub process_job { + my ( $self, $job ) = @_; + + my $job_running_or_run = 0; + + if ( $job->status_code eq 'NOT_RUN' ) { + + # Job not yet run so submit it + $job->stage->set_all_jobs_run(0); + $self->say_if_verbose( sprintf ' Running component %d of %s', + $job->component, $job->stage->name ); + $job_running_or_run = 1; + $self->submit_job($job); + } + elsif ( $job->status_code eq 'RUNNING' ) { + + # Job is running + $job->stage->set_all_jobs_run(0); + $self->say_if_verbose( sprintf ' Component %d of %s is still running', + $job->component, $job->stage->name ); + $job_running_or_run = 1; + } + elsif ( $job->status_code eq 'FAILED' ) { + + # Job has failed, so submit again + $job->stage->set_all_jobs_run(0); + $self->say_if_verbose( sprintf ' Component %d of %s has FAILED: %s', + $job->component, $job->stage->name, $job->status_text ); + if ( $job->retries < $self->max_retries ) { + $self->say_if_verbose( sprintf ' Running component %d of %s', + $job->component, $job->stage->name ); + $job_running_or_run = 1; + $self->submit_job($job); + } + else { + $self->say_if_verbose( + sprintf + ' Not running component %d of %s because retried %d times', + $job->component, $job->stage->name, $job->retries ); + } + } + + return $job_running_or_run; +} + +=method submit_job + + Usage : $pipeline->submit_job($job); + Purpose : Submit a job + Returns : undef + Parameters : DETCT::Pipeline::Job + Throws : If job or bsub can't be run + If job id can't be extracted from bsub output + Comments : None + +=cut + +sub submit_job { + my ( $self, $job ) = @_; + + my $stdout_file = $job->base_filename . '.o'; + my $stderr_file = $job->base_filename . '.e'; + + my $cmd = + $self->cmd_line + . ' --stage ' + . $job->stage->name + . ' --component ' + . $job->component; + + if ( $job->scheduler eq 'local' ) { + + # Just run job + $cmd .= ' 1>' . $stdout_file; + $cmd .= ' 2>' . $stderr_file; + my $cmd_status = system $cmd; + + # Die if the command was interrupted + if ( WIFSIGNALED($cmd_status) && WTERMSIG($cmd_status) == 2 ) { + $self->_delete_lock(); + die "\n" . 'Interrupted' . "\n"; + } + + # Die if the command couldn't be run + confess "Couldn't run $cmd ($OS_ERROR)" if !WIFEXITED($cmd_status); + + if ( defined $job->retries ) { + $job->set_retries( $job->retries + 1 ); + } + else { + $job->set_retries(0); + } + my $dump = { retries => $job->retries, }; + my $job_file = $job->base_filename . '.job'; + DumpFile( $job_file, $dump ); + } + elsif ( $job->scheduler eq 'lsf' ) { + + # Either use default memory or increase by 50% (if retrying failed job) + if ( !$job->memory ) { + $job->set_memory( $job->stage->default_memory ); + } + elsif ( $job->status_text =~ m/\A MEMLIMIT /xms ) { + ## no critic (ProhibitMagicNumbers) + $job->set_memory( int( $job->memory * 1.5 ) ); + ## use critic + } + + # bsub job + my $bsub_stdout_file = $job->base_filename . '.bsub.o'; + my $bsub_stderr_file = $job->base_filename . 
'.bsub.e'; + ## no critic (ProhibitMagicNumbers) + my $memory_clause = sprintf q{ -R'select[mem>%d] rusage[mem=%d]' -M%d }, + $job->memory, $job->memory, $job->memory * 1000; + ## use critic + $cmd = + 'bsub' . ' -oo ' + . $stdout_file . ' -eo ' + . $stderr_file + . $memory_clause + . $cmd . ' 1>' + . $bsub_stdout_file . ' 2>' + . $bsub_stderr_file; + WIFEXITED( system $cmd) or confess "Couldn't run $cmd ($OS_ERROR)"; + + # Extract job id from bsub output and store along with other parameters + my $bsub_stdout = read_file($bsub_stdout_file); + if ( $bsub_stdout =~ m/Job \s <(\d+)> \s is \s submitted/xms ) { + my $id = $1; + if ( defined $job->retries ) { + $job->set_retries( $job->retries + 1 ); + } + else { + $job->set_retries(0); + } + my $dump = { + id => $id, + retries => $job->retries, + memory => $job->memory, + }; + my $job_file = $job->base_filename . '.job'; + DumpFile( $job_file, $dump ); + } + else { + confess "Couldn't get job id from $bsub_stdout_file"; + } + } + + return; +} + +=method all_parameters + + Usage : @all_parameters = $pipeline->all_parameters( $stage ); + Purpose : Get all the parameters for a stage + Returns : Array + Parameters : DETCT::Pipeline::Stage + Throws : No exceptions + Comments : This function calls the all_parameters_for_ method associated + with the current stage and gets all the parameters for that + stage as an array of arbitrary data (e.g. arrayref or scalar) + +=cut + +sub all_parameters { + my ( $self, $stage ) = @_; + + my $sub_name = 'all_parameters_for_' . $stage->name; + + return $self->$sub_name(); +} + +=method run_job + + Usage : $pipeline->run_job($job); + Purpose : Run a job + Returns : undef + Parameters : DETCT::Pipeline::Job + Throws : No exceptions + Comments : This function calls the run_ method associated with the current + stage and passes along the parameters for the current component, + which are arbitrary (e.g. arrayref or scalar) + +=cut + +sub run_job { + my ( $self, $job ) = @_; + + my $sub_name = 'run_' . $job->stage->name; + + $self->$sub_name($job); + + return; +} + +=method input_overview + + Usage : $pipeline->say_if_verbose($pipeline->input_overview); + Purpose : Return textual overview of pipeline's input + Returns : Array of Strings + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub input_overview { + my ($self) = @_; + + my @output; + + push @output, 'Command line:', $self->cmd_line; + if ( defined $DETCT::VERSION ) { + push @output, 'DETCT version:' . $DETCT::VERSION; + } + push @output, 'Working directory: ' . $self->analysis_dir; + + push @output, 'BAM files: ' . 
join q{ }, + $self->analysis->list_all_bam_files(); + + push @output, sprintf 'Number of samples: %d', + scalar @{ $self->analysis->get_all_samples }; + push @output, sprintf 'Number of sequences: %d', + scalar @{ $self->analysis->get_all_sequences }; + push @output, sprintf 'Number of chunks: %d', $self->analysis->chunk_total; + + push @output, 'Number of sequences per chunk:'; + my $chunk_component = 0; + foreach my $chunk ( @{ $self->analysis->get_all_chunks } ) { + $chunk_component++; + push @output, sprintf ' Chunk %d: %d sequences', + $chunk_component, scalar @{$chunk}; + } + + return @output; +} + +=method say_if_verbose + + Usage : $pipeline->say_if_verbose( 'Command line:', $cmd_line ); + Purpose : Print output if pipeline is set to verbose + Returns : undef + Parameters : Array of Strings + Throws : No exceptions + Comments : Each string is a line without carriage returns or newlines + +=cut + +sub say_if_verbose { + my ( $self, @output ) = @_; + if ( $self->verbose ) { + print join "\n", @output; + print "\n"; + } + return; +} + +=method write_log_file + + Usage : $pipeline->write_log_file( @output ); + Purpose : Write data to a specified log file + Returns : undef + Parameters : String (the filename) + Array of Strings + Throws : No exceptions + Comments : None + +=cut + +sub write_log_file { + my ( $self, $filename, @output ) = @_; + + my $log_file = File::Spec->catfile( $self->analysis_dir, $filename ); + write_file( $log_file, @output ); + + return; +} + +# Usage : $self->_create_lock(); +# Purpose : Create lock file +# Returns : undef +# Parameters : None +# Throws : If lock file already exists +# Comments : None +sub _create_lock { + my ($self) = @_; + + my $lock_file = File::Spec->catfile( $self->analysis_dir, 'pipeline.lock' ); + + if ( -e $lock_file ) { + my $message = + "\nERROR: Is another pipeline running?\n" + . "Make sure it is not still running before deleting $lock_file and restarting.\n" + . "Lock file contains:\n\n"; + $message .= read_file($lock_file); + die $message . "\n"; + } + + my $hostname = hostname(); + my $timestamp = localtime; + write_file( $lock_file, $hostname . "\n" . $timestamp . "\n" ); + + return; +} + +# Usage : $self->_delete_lock(); +# Purpose : Delete lock file +# Returns : undef +# Parameters : None +# Throws : No exceptions +# Comments : None +sub _delete_lock { + my ($self) = @_; + + my $lock_file = File::Spec->catfile( $self->analysis_dir, 'pipeline.lock' ); + + unlink $lock_file; + + return; +} + +=method clean_up + + Usage : $self->clean_up(); + Purpose : Move results, archive stages and delete data + Returns : undef + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub clean_up { + my ($self) = @_; + + # Already cleaned up? + my $done_marker_file = + File::Spec->catfile( $self->analysis_dir, 'cleanup.done' ); + return if -e $done_marker_file; + + print 'Cleaning up...' . "\n"; + + # Tar all stages + my @stage_dirs = + map { $self->get_and_create_stage_dir($_) } @{ $self->get_all_stages() }; + my $tarball_file = + File::Spec->catfile( $self->analysis_dir, 'archive.tar.gz' ); + my $cmd = join q{ }, 'tar', 'cf', q{-}, @stage_dirs, q{|}, 'gzip', '-9', + '-c', q{>}, $tarball_file; + WIFEXITED( system $cmd) or confess "Couldn't run $cmd ($OS_ERROR)"; + + # Delete or move files + my $wanted = \&_move_or_delete; + find( + { + wanted => sub { $wanted->( $self->analysis_dir ) }, + postprocess => sub { rmdir $File::Find::dir }, + no_chdir => 1, + }, + @stage_dirs + ); + + write_file( $done_marker_file, '1' ); + + print 'Done' . 
"\n"; + + return; +} + +# Usage : find(\&_move_or_delete, $dir); +# Purpose : Move results files and delete other files +# Returns : undef +# Parameters : None +# Throws : No exceptions +# Comments : None +sub _move_or_delete { + my ($archive_dir) = @_; + + return if -d; # Ignore directories + + my ( $filename, undef, $extension ) = fileparse($File::Find::name); + + # Move or delete? + if ( $EXTENSION_TO_KEEP{$extension} ) { + rename $File::Find::name, + File::Spec->catfile( $archive_dir, $filename ); + } + else { + unlink $File::Find::name; + } + + return; +} + +1; diff --git a/lib/DETCT/Pipeline/Job.pm b/lib/DETCT/Pipeline/Job.pm new file mode 100644 index 0000000..ce8f348 --- /dev/null +++ b/lib/DETCT/Pipeline/Job.pm @@ -0,0 +1,697 @@ +## no critic (RequireUseStrict, RequireUseWarnings, RequireTidyCode) +package DETCT::Pipeline::Job; +## use critic + +# ABSTRACT: Object representing a pipeline job + +## Author : is1 +## Maintainer : is1 +## Created : 2013-01-17 +## Last commit by : $Author$ +## Last modified : $Date$ +## Revision : $Revision$ +## Repository URL : $HeadURL$ + +use warnings; +use strict; +use autodie; +use Carp; +use Try::Tiny; + +use Readonly; +use Class::InsideOut qw( private register id ); +use English qw( -no_match_vars ); +use File::ReadBackwards; +use YAML::Tiny qw( LoadFile ); + +=head1 SYNOPSIS + + # Brief code examples + +=cut + +# Attributes: +private stage => my %stage; # DETCT::Pipeline::Stage object +private component => my %component; # e.g. 2 +private scheduler => my %scheduler; # e.g. lsf +private base_filename => my %base_filename; # e.g. ./run_deseq/1 +private parameters => my %parameters; # e.g. arrayref or scalar +private retries => my %retries; # e.g. 5 +private memory => my %memory; # e.g. 3000 +private status_code => my %status_code; # e.g. DONE +private status_text => my %status_text; # e.g. 
Job killed by owner + +# Constants +Readonly our %STATUS_FOR => ( + PEND => 'RUNNING', + PSUSP => 'RUNNING', + RUN => 'RUNNING', + USUSP => 'RUNNING', + SSUSP => 'RUNNING', + WAIT => 'RUNNING', + EXIT => 'FAILED', + UNKWN => 'FAILED', + ZOMBI => 'FAILED', + DONE => 'DONE', +); + +=method new + + Usage : my $job = DETCT::Pipeline::Job->new( { + stage => $stage, + component => 2, + scheduler => 'lsf', + base_filename => './run_deseq/1', + parameters => $parameters, + } ); + Purpose : Constructor for job objects + Returns : DETCT::Pipeline::Job + Parameters : Hashref { + stage => DETCT::Pipeline::Stage, + component => Int, + scheduler => String, + base_filename => String, + parameters => Any (probably arrayref or scalar) + } + Throws : No exceptions + Comments : None + +=cut + +sub new { + my ( $class, $arg_ref ) = @_; + my $self = register($class); + $self->set_stage( $arg_ref->{stage} ); + $self->set_component( $arg_ref->{component} ); + $self->set_scheduler( $arg_ref->{scheduler} ); + $self->set_base_filename( $arg_ref->{base_filename} ); + $self->set_parameters( $arg_ref->{parameters} ); + $self->set_state_from_filesystem(); + return $self; +} + +=method stage + + Usage : my $stage = $job->stage; + Purpose : Getter for stage attribute + Returns : DETCT::Pipeline::Stage + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub stage { + my ($self) = @_; + return $stage{ id $self}; +} + +=method set_stage + + Usage : $job->set_stage($stage); + Purpose : Setter for stage attribute + Returns : undef + Parameters : DETCT::Pipeline::Stage + Throws : No exceptions + Comments : None + +=cut + +sub set_stage { + my ( $self, $arg ) = @_; + $stage{ id $self} = _check_stage($arg); + return; +} + +# Usage : $stage = _check_stage($stage); +# Purpose : Check for valid stage object +# Returns : DETCT::Pipeline::Stage +# Parameters : DETCT::Pipeline::Stage +# Throws : If stage object is missing or invalid (i.e. not a +# DETCT::Pipeline::Stage object) +# Comments : None +sub _check_stage { + my ($stage) = @_; + return $stage if defined $stage && $stage->isa('DETCT::Pipeline::Stage'); + confess 'No stage specified' if !defined $stage; + confess 'Class of stage (', ref $stage, ') not DETCT::Pipeline::Stage'; +} + +=method component + + Usage : my $component = $job->component; + Purpose : Getter for component attribute + Returns : +ve Int + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub component { + my ($self) = @_; + return $component{ id $self}; +} + +=method set_component + + Usage : $job->set_component(2); + Purpose : Setter for component attribute + Returns : undef + Parameters : +ve Int (the component) + Throws : No exceptions + Comments : None + +=cut + +sub set_component { + my ( $self, $arg ) = @_; + $component{ id $self} = _check_component($arg); + return; +} + +# Usage : $component = _check_component($component); +# Purpose : Check for valid component +# Returns : +ve Int (the valid component) +# Parameters : +ve Int (the component) +# Throws : If component is missing or not a positive integer +# Comments : None +sub _check_component { + my ($component) = @_; + return $component if defined $component && $component =~ m/\A \d+ \z/xms; + confess 'No component specified' if !defined $component; + confess "Invalid component ($component) specified"; +} + +=method scheduler + + Usage : my $scheduler = $job->scheduler; + Purpose : Getter for scheduler attribute + Returns : String (e.g. 
"lsf") + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub scheduler { + my ($self) = @_; + return $scheduler{ id $self}; +} + +=method set_scheduler + + Usage : $job->set_scheduler('lsf'); + Purpose : Setter for scheduler attribute + Returns : undef + Parameters : String (the scheduler) + Throws : No exceptions + Comments : None + +=cut + +sub set_scheduler { + my ( $self, $arg ) = @_; + $scheduler{ id $self} = _check_scheduler($arg); + return; +} + +# Usage : $scheduler = _check_scheduler($scheduler); +# Purpose : Check for valid scheduler +# Returns : String (the valid scheduler) +# Parameters : String (the scheduler) +# Throws : If scheduler is not lsf or local +# Comments : None +sub _check_scheduler { + my ($scheduler) = @_; + + confess 'Invalid scheduler specified' + if !defined $scheduler + || ( $scheduler ne 'lsf' && $scheduler ne 'local' ); + + return $scheduler; +} + +=method base_filename + + Usage : my $base_filename = $job->base_filename; + Purpose : Getter for the base filename attribute + Returns : String (e.g. "./run_deseq/1") + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub base_filename { + my ($self) = @_; + return $base_filename{ id $self}; +} + +=method set_base_filename + + Usage : $job->set_base_filename('./run_deseq/1'); + Purpose : Setter for the base filename attribute + Returns : undef + Parameters : String (the base filename) + Throws : No exceptions + Comments : None + +=cut + +sub set_base_filename { + my ( $self, $arg ) = @_; + $base_filename{ id $self} = _check_base_filename($arg); + return; +} + +# Usage : $base_filename = _check_base_filename($base_filename); +# Purpose : Check for valid base filename +# Returns : String (the valid base filename) or undef +# Parameters : String (the base filename) +# Throws : If base filename is missing +# Comments : None +sub _check_base_filename { + my ($base_filename) = @_; + + confess 'No base filename specified' + if !defined $base_filename || !$base_filename; + + return $base_filename; +} + +=method parameters + + Usage : my $parameters = $job->parameters; + Purpose : Getter for parameters attribute + Returns : Any (usually arrayref or scalar) + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub parameters { + my ($self) = @_; + return $parameters{ id $self}; +} + +=method set_parameters + + Usage : $job->set_parameters($parameters); + Purpose : Setter for parameters attribute + Returns : undef + Parameters : Any (the parameters; usually arrayref or scalar) + Throws : No exceptions + Comments : None + +=cut + +sub set_parameters { + my ( $self, $arg ) = @_; + $parameters{ id $self} = $arg; + return; +} + +=method retries + + Usage : my $retries = $job->retries; + Purpose : Getter for retries attribute + Returns : +ve Int + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub retries { + my ($self) = @_; + return $retries{ id $self}; +} + +=method set_retries + + Usage : $job->set_retries(5); + Purpose : Setter for retries attribute + Returns : undef + Parameters : +ve Int (the retries) + Throws : No exceptions + Comments : None + +=cut + +sub set_retries { + my ( $self, $arg ) = @_; + $retries{ id $self} = _check_retries($arg); + return; +} + +# Usage : $retries = _check_retries($retries); +# Purpose : Check for valid retries +# Returns : +ve Int (the valid retries) +# Parameters : +ve Int (the retries) +# Throws : If retries is not a positive integer +# Comments : None +sub _check_retries { + my ($retries) = @_; + 
+ confess "Invalid retries ($retries) specified" + if defined $retries && $retries !~ m/\A \d+ \z/xms; + + return $retries; +} + +=method memory + + Usage : my $memory = $job->memory; + Purpose : Getter for memory attribute + Returns : +ve Int + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub memory { + my ($self) = @_; + return $memory{ id $self}; +} + +=method set_memory + + Usage : $job->set_memory(1000); + Purpose : Setter for memory attribute + Returns : undef + Parameters : +ve Int (the memory) + Throws : No exceptions + Comments : None + +=cut + +sub set_memory { + my ( $self, $arg ) = @_; + $memory{ id $self} = _check_memory($arg); + return; +} + +# Usage : $memory = _check_memory($memory); +# Purpose : Check for valid memory +# Returns : +ve Int (the valid memory) +# Parameters : +ve Int (the memory) +# Throws : If memory is not a positive integer +# Comments : None +sub _check_memory { + my ($memory) = @_; + + confess "Invalid memory ($memory) specified" + if defined $memory && $memory !~ m/\A \d+ \z/xms; + + return $memory; +} + +=method status_code + + Usage : my $status_code = $job->status_code; + Purpose : Getter for the status code attribute + Returns : String (e.g. "DONE") + Parameters : None + Throws : No exceptions + Comments : Status code can be RUNNING, FAILED, DONE or NOT_RUN + +=cut + +sub status_code { + my ($self) = @_; + return $status_code{ id $self}; +} + +=method set_status_code + + Usage : $job->set_status_code('DONE'); + Purpose : Setter for the status code attribute + Returns : undef + Parameters : String (the status code) + Throws : No exceptions + Comments : None + +=cut + +sub set_status_code { + my ( $self, $arg ) = @_; + $status_code{ id $self} = _check_status_code($arg); + return; +} + +# Usage : $status_code = _check_status_code($status_code); +# Purpose : Check for valid status code +# Returns : String (the valid status code) +# Parameters : String (the status code) +# Throws : If status code is not valid +# Comments : None +sub _check_status_code { + my ($status_code) = @_; + + return $status_code + if defined $status_code + && ( $status_code eq 'RUNNING' + || $status_code eq 'FAILED' + || $status_code eq 'DONE' + || $status_code eq 'NOT_RUN' ); + confess 'No status code specified' if !defined $status_code; + confess "Invalid status code ($status_code) specified"; +} + +=method status_text + + Usage : my $status_text = $job->status_text; + Purpose : Getter for status text attribute + Returns : String (e.g. 
"Job killed by owner") + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub status_text { + my ($self) = @_; + return $status_text{ id $self}; +} + +=method set_status_text + + Usage : $job->set_status_text('Job killed by owner'); + Purpose : Setter for status text attribute + Returns : undef + Parameters : String (the status text) + Throws : No exceptions + Comments : None + +=cut + +sub set_status_text { + my ( $self, $arg ) = @_; + $status_text{ id $self} = $arg; + return; +} + +=method set_state_from_filesystem + + Usage : $job->set_state_from_filesystem(); + Purpose : Set state-related attributes of a job from filesystem + Returns : undef + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub set_state_from_filesystem { + my ($self) = @_; + + if ( $self->scheduler eq 'local' ) { + $self->_set_state_from_filesystem_for_local(); + } + elsif ( $self->scheduler eq 'lsf' ) { + $self->_set_state_from_filesystem_for_lsf(); + } + + return; +} + +# Usage : $self->_set_state_from_filesystem_for_local(); +# Purpose : Set state-related attributes of a job run locally (probably for +# testing) +# Returns : None +# Parameters : None +# Throws : No exceptions +# Comments : None +sub _set_state_from_filesystem_for_local { + my ($self) = @_; + + my $output_file = $self->base_filename . '.out'; + + my $job_file = $self->base_filename . '.job'; + + # Check if job has even been run yet + if ( !-e $job_file ) { + $self->set_status_code('NOT_RUN'); + return; + } + + # Get number of retries + my $yaml = LoadFile($job_file); + my $retries = $yaml->{retries}; + $self->set_retries($retries); + + if ( -e $output_file && !-z $output_file ) { + $self->set_status_code('DONE'); + } + elsif ( !-e $output_file || -z $output_file ) { + $self->set_status_code('FAILED'); + $self->set_status_text( 'Enpty output file: ' . $output_file ); + } + + return; +} + +# Usage : $self->_set_state_from_filesystem_for_lsf(); +# Purpose : Set state-related attributes of a job submitted to LSF +# Returns : None +# Parameters : None +# Throws : If job id in job file is not an integer +# If the status returned by bjobs is not recognised +# Comments : None +sub _set_state_from_filesystem_for_lsf { + my ($self) = @_; + + my $output_file = $self->base_filename . '.out'; + + my $job_file = $self->base_filename . '.job'; + + # Check if job has even been run yet + if ( !-e $job_file ) { + $self->set_status_code('NOT_RUN'); + return; + } + + # Get job id + my $yaml = LoadFile($job_file); + my $job_id = $yaml->{id}; + if ( $job_id !~ /\A \d+ \z/xms ) { + confess "Job ID ($job_id) not valid"; + } + + # Get number of retries + my $retries = $yaml->{retries}; + $self->set_retries($retries); + + # Get memory requested + my $memory = $yaml->{memory}; + $self->set_memory($memory); + + my ( $status_code, $status_text ); + + # Get job status for job id from bjobs command + my $lsf_status; + open my $pipe, q{-|}, 'bjobs ' . $job_id . ' 2>/dev/null'; # Hide STDERR + while ( my $job_line = <$pipe> ) { + if ( $job_line =~ m/\A $job_id \s+ \S+ \s+ (\S+)/xms ) { + $lsf_status = $1; + } + } + close $pipe; + if ($lsf_status) { + + # Got job status from bjobs + if ( !exists $STATUS_FOR{$lsf_status} ) { + confess "Unknown LSF status ($lsf_status)"; + } + $status_code = $STATUS_FOR{$lsf_status}; + $status_text = 'LSF status: ' . 
$lsf_status; + } + if ( !$status_code || $status_code eq 'FAILED' ) { + + # If bjobs doesn't return status or failed then check job's STDOUT + ( $status_code, $status_text ) = $self->_parse_lsf_stdout($job_id); + } + + $self->set_status_code($status_code); + $self->set_status_text($status_text); + + return; +} + +# Usage : ($status_code, $status_text) +# = $pipeline->_parse_lsf_stdout($job_id); +# Purpose : Parses LSF's STDOUT to get a job's status +# Returns : String (status code: DONE or FAILED) +# String (status info) or undef +# Parameters : Int (the job id) +# Throws : If STDOUT file can't be read +# Comments : Based on +# https://github.com/VertebrateResequencing/vr-pipe/blob/master/modules/VRPipe/Parser/lsf.pm +sub _parse_lsf_stdout { + my ( $self, $job_id ) = @_; + + # Check STDOUT file exists at all (in case job was killed whilst pending) + my $stdout_file = $self->base_filename . '.o'; + if ( !-e $stdout_file ) { + return 'FAILED', 'Job did not run'; + } + + # STDOUT file is overwritten so no need to read backwards to get last job + my ( $status_code, $status_text, $stdout_job_id ); + my $found_start = 0; + my $found_end = 0; + my $bw = File::ReadBackwards->new($stdout_file) + or confess "Can't read $stdout_file: $OS_ERROR"; + while ( defined( my $line = $bw->readline ) ) { + if ( $line =~ m/\A Resource \s usage \s summary: /xms ) { + $found_end = 1; + next; + } + elsif ( $line =~ m/\A Sender: \s LSF \s System/xms ) { + $found_start = 1; + last; # Will find start after end + } + elsif ($found_end) { + + # Get job id + if ( $line =~ m/\A Subject: \s Job \s (\d+):/xms ) { + $stdout_job_id = $1; + } + + # Get job's status code + if ( $line =~ m/\A Successfully \s completed[.] /xms ) { + $status_code = 'DONE'; + } + elsif ( !$status_code + && $line =~ + m/\A Exited \s with \s exit \s code \s (\d+) [.] /xms ) + { + $status_code = 'FAILED'; + $status_text = "Exit code: $1"; + } + elsif ( $line =~ m/\A TERM_ (\w+: .*) [.] /xms ) { + $status_code = 'FAILED'; + $status_text = $1; + } + } + } + + # Ensure correct job + if ( defined $stdout_job_id && $job_id != $stdout_job_id ) { + $status_code = 'FAILED'; + $status_text = "Wrong job id (expecting $job_id, got $stdout_job_id)"; + } + + # If no status then STDOUT could not be parsed + if ( !defined $status_code ) { + $status_code = 'FAILED'; + $status_text = "Could not parse job's STDOUT: $stdout_file"; + } + + return $status_code, $status_text; +} + +1; diff --git a/lib/DETCT/Pipeline/Stage.pm b/lib/DETCT/Pipeline/Stage.pm new file mode 100644 index 0000000..f626279 --- /dev/null +++ b/lib/DETCT/Pipeline/Stage.pm @@ -0,0 +1,236 @@ +## no critic (RequireUseStrict, RequireUseWarnings, RequireTidyCode) +package DETCT::Pipeline::Stage; +## use critic + +# ABSTRACT: Object representing a pipeline stage + +## Author : is1 +## Maintainer : is1 +## Created : 2013-01-09 +## Last commit by : $Author$ +## Last modified : $Date$ +## Revision : $Revision$ +## Repository URL : $HeadURL$ + +use warnings; +use strict; +use autodie; +use Carp; +use Try::Tiny; + +use Class::InsideOut qw( private register id ); + +=head1 SYNOPSIS + + # Brief code examples + +=cut + +# Attributes: +private name => my %name; # e.g. count_tags +private default_memory => my %default_memory; # e.g. 3000 +private all_jobs_run => my %all_jobs_run; # e.g. 
1 +private prerequisite => my %prerequisite; # arrayref of stages + +=method new + + Usage : my $stage = DETCT::Pipeline::Stage->new( { + name => 'count_tags', + default_memory => 3000, + } ); + Purpose : Constructor for stage objects + Returns : DETCT::Pipeline::Stage + Parameters : Hashref { + name => String, + default_memory => Int, + all_jobs_run => Boolean or undef, + } + Throws : No exceptions + Comments : None + +=cut + +sub new { + my ( $class, $arg_ref ) = @_; + my $self = register($class); + $self->set_name( $arg_ref->{name} ); + $self->set_default_memory( $arg_ref->{default_memory} ); + $self->set_all_jobs_run( $arg_ref->{all_jobs_run} ); + return $self; +} + +=method name + + Usage : my $name = $stage->name; + Purpose : Getter for name attribute + Returns : String (e.g. "count_tags") + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub name { + my ($self) = @_; + return $name{ id $self}; +} + +=method set_name + + Usage : $stage->set_name('count_tags'); + Purpose : Setter for name attribute + Returns : undef + Parameters : String (the name) + Throws : No exceptions + Comments : None + +=cut + +sub set_name { + my ( $self, $arg ) = @_; + $name{ id $self} = _check_name($arg); + return; +} + +# Usage : $name = _check_name($name); +# Purpose : Check for valid name +# Returns : String (the valid name) +# Parameters : String (the name) +# Throws : If name is missing or invalid (i.e. not alphanumeric) +# Comments : None +sub _check_name { + my ($name) = @_; + + return $name if defined $name && $name =~ m/\A \w+ \z/xms; + confess 'No name specified' if !defined $name; + confess "Invalid name ($name) specified"; +} + +=method default_memory + + Usage : my $default_memory = $stage->default_memory; + Purpose : Getter for default memory attribute + Returns : +ve Int + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub default_memory { + my ($self) = @_; + return $default_memory{ id $self}; +} + +=method set_default_memory + + Usage : $stage->set_default_memory(3000); + Purpose : Setter for default memory attribute + Returns : undef + Parameters : +ve Int (the default memory) + Throws : No exceptions + Comments : None + +=cut + +sub set_default_memory { + my ( $self, $arg ) = @_; + $default_memory{ id $self} = _check_default_memory($arg); + return; +} + +# Usage : $default_memory = _check_default_memory($default_memory); +# Purpose : Check for valid default memory +# Returns : +ve Int (the valid default memory) +# Parameters : +ve Int (the default memory) +# Throws : If default memory is missing or not a positive integer +# Comments : None +sub _check_default_memory { + my ($default_memory) = @_; + return $default_memory + if defined $default_memory && $default_memory =~ m/\A \d+ \z/xms; + confess 'No default memory specified' if !defined $default_memory; + confess "Invalid default memory ($default_memory) specified"; +} + +=method all_jobs_run + + Usage : my $all_jobs_run = $stage->all_jobs_run; + Purpose : Getter for all jobs run flag + Returns : Boolean + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub all_jobs_run { + my ($self) = @_; + return $all_jobs_run{ id $self} || 0; +} + +=method set_all_jobs_run + + Usage : $stage->set_all_jobs_run(1); + Purpose : Setter for all jobs run flag + Returns : undef + Parameters : Boolean + Throws : No exceptions + Comments : None + +=cut + +sub set_all_jobs_run { + my ( $self, $arg ) = @_; + $all_jobs_run{ id $self} = $arg ? 
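+ # Normalise any truthy value to 1 and anything else (including undef) to 0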
1 : 0; + return; +} + +=method add_prerequisite + + Usage : $stage->add_prerequisite($prerequisite); + Purpose : Add a prerequisite to a stage + Returns : undef + Parameters : DETCT::Pipeline::Stage + Throws : If prerequisite is missing or invalid (i.e. not a + DETCT::Pipeline::Stage object) + Comments : None + +=cut + +sub add_prerequisite { + my ( $self, $prerequisite ) = @_; + + confess 'No prerequisite specified' if !defined $prerequisite; + confess 'Class of prerequisite (', ref $prerequisite, + ') not DETCT::Pipeline::Stage' + if !$prerequisite->isa('DETCT::Pipeline::Stage'); + + if ( !exists $prerequisite{ id $self} ) { + $prerequisite{ id $self} = [$prerequisite]; + } + else { + push @{ $prerequisite{ id $self} }, $prerequisite; + } + + return; +} + +=method get_all_prerequisites + + Usage : $prerequisites = $stage->get_all_prerequisites(); + Purpose : Get all prerequisites of a stage + Returns : Arrayref of DETCT::Pipeline::Stage objects + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub get_all_prerequisites { + my ($self) = @_; + + return $prerequisite{ id $self} || []; +} + +1; diff --git a/lib/DETCT/Pipeline/WithDiffExprStages.pm b/lib/DETCT/Pipeline/WithDiffExprStages.pm new file mode 100644 index 0000000..5f2161d --- /dev/null +++ b/lib/DETCT/Pipeline/WithDiffExprStages.pm @@ -0,0 +1,1236 @@ +## no critic (RequireUseStrict, RequireUseWarnings, RequireTidyCode) +package DETCT::Pipeline::WithDiffExprStages; +## use critic + +# ABSTRACT: Object representing a differential expression pipeline + +## Author : is1 +## Maintainer : is1 +## Created : 2013-01-16 +## Last commit by : $Author$ +## Last modified : $Date$ +## Revision : $Revision$ +## Repository URL : $HeadURL$ + +use warnings; +use strict; +use autodie; +use Carp; +use Try::Tiny; + +use parent qw(DETCT::Pipeline); + +use Class::InsideOut qw( private register id ); +use Scalar::Util qw( refaddr ); +use YAML::Tiny qw( DumpFile LoadFile ); +use DETCT::GeneFinder; +use DETCT::Misc::BAM qw( + count_tags + bin_reads + get_read_peaks + get_three_prime_ends + merge_three_prime_ends + filter_three_prime_ends + choose_three_prime_end + count_reads + merge_read_counts +); +use DETCT::Misc::PeakHMM qw( + merge_read_peaks + summarise_read_peaks + run_peak_hmm + join_hmm_bins +); +use DETCT::Misc::R qw( + run_deseq +); +use DETCT::Misc::Output qw( + dump_as_table +); + +=head1 SYNOPSIS + + # Brief code examples + +=cut + +=method all_parameters_by_bam_file_then_chunk + + Usage : all_parameters_by_bam_file_then_chunk(); + Purpose : Get all parameters for stage that requires jobs split up by BAM + file then by chunk + Returns : Array of arrayrefs + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub all_parameters_by_bam_file_then_chunk { + my ($self) = @_; + + my @all_parameters; + + my $chunks = $self->analysis->get_all_chunks(); + + foreach my $bam_file ( $self->analysis->list_all_bam_files() ) { + my @tags = $self->analysis->list_all_tags_by_bam_file($bam_file); + foreach my $chunk ( @{$chunks} ) { + push @all_parameters, [ $bam_file, $chunk, @tags ]; + } + } + + return @all_parameters; +} + +=method all_parameters_for_count_tags + + Usage : all_parameters_for_count_tags(); + Purpose : Get all parameters for count_tags stage + Returns : Array of arrayrefs + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub all_parameters_for_count_tags { + my ($self) = @_; + + return $self->all_parameters_by_bam_file_then_chunk(); +} + +=method run_count_tags + + Usage : 
run_count_tags(); + Purpose : Run function for count_tags stage + Returns : undef + Parameters : DETCT::Pipeline::Job + Throws : No exceptions + Comments : None + +=cut + +sub run_count_tags { + my ( $self, $job ) = @_; + + my ( $bam_file, $chunk, @tags ) = @{ $job->parameters }; + + my %chunk_count; + + # Get count for each sequence of a chunk separately and then merge + foreach my $seq ( @{$chunk} ) { + my $seq_count = count_tags( + { + bam_file => $bam_file, + mismatch_threshold => $self->analysis->mismatch_threshold, + seq_name => $seq->name, + tags => \@tags, + } + ); + %chunk_count = + %{ $self->hash_merge->merge( \%chunk_count, $seq_count ) }; + } + + my $output_file = $job->base_filename . '.out'; + + DumpFile( $output_file, \%chunk_count ); + + return; +} + +=method all_parameters_for_bin_reads + + Usage : all_parameters_for_bin_reads(); + Purpose : Get all parameters for bin_reads stage + Returns : Array of arrayrefs + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub all_parameters_for_bin_reads { + my ($self) = @_; + + return $self->all_parameters_by_bam_file_then_chunk(); +} + +=method run_bin_reads + + Usage : run_bin_reads(); + Purpose : Run function for bin_reads stage + Returns : undef + Parameters : DETCT::Pipeline::Job + Throws : No exceptions + Comments : None + +=cut + +sub run_bin_reads { + my ( $self, $job ) = @_; + + my ( $bam_file, $chunk, @tags ) = @{ $job->parameters }; + + my %chunk_bins; + + # Get bins for each sequence of a chunk separately and then merge + foreach my $seq ( @{$chunk} ) { + my $seq_bins = bin_reads( + { + bam_file => $bam_file, + mismatch_threshold => $self->analysis->mismatch_threshold, + bin_size => $self->analysis->bin_size, + seq_name => $seq->name, + tags => \@tags, + } + ); + %chunk_bins = %{ $self->hash_merge->merge( \%chunk_bins, $seq_bins ) }; + } + + my $output_file = $job->base_filename . '.out'; + + DumpFile( $output_file, \%chunk_bins ); + + return; +} + +=method all_parameters_for_get_read_peaks + + Usage : all_parameters_for_get_read_peaks(); + Purpose : Get all parameters for get_read_peaks stage + Returns : Array of arrayrefs + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub all_parameters_for_get_read_peaks { + my ($self) = @_; + + return $self->all_parameters_by_bam_file_then_chunk(); +} + +=method run_get_read_peaks + + Usage : run_get_read_peaks(); + Purpose : Run function for get_read_peaks stage + Returns : undef + Parameters : DETCT::Pipeline::Job + Throws : No exceptions + Comments : None + +=cut + +sub run_get_read_peaks { + my ( $self, $job ) = @_; + + my ( $bam_file, $chunk, @tags ) = @{ $job->parameters }; + + my %chunk_peaks; + + # Get read peaks for each sequence of a chunk separately and then merge + foreach my $seq ( @{$chunk} ) { + my $seq_peaks = get_read_peaks( + { + bam_file => $bam_file, + mismatch_threshold => $self->analysis->mismatch_threshold, + peak_buffer_width => $self->analysis->peak_buffer_width, + seq_name => $seq->name, + tags => \@tags, + } + ); + %chunk_peaks = + %{ $self->hash_merge->merge( \%chunk_peaks, $seq_peaks ) }; + } + + my $output_file = $job->base_filename . 
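+ # Each stage component serialises its results as YAML to <base_filename>.out; later stages reload these files with LoadFile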
'.out'; + + DumpFile( $output_file, \%chunk_peaks ); + + return; +} + +=method all_parameters_for_merge_read_peaks + + Usage : all_parameters_for_merge_read_peaks(); + Purpose : Get all parameters for merge_read_peaks stage + Returns : Array of arrayrefs + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub all_parameters_for_merge_read_peaks { + my ($self) = @_; + + my @all_parameters; + + my $chunks = $self->analysis->get_all_chunks(); + + # Work out which get_read_peaks stage files need to be combined + foreach my $merge_chunk ( @{$chunks} ) { + my @get_read_peaks_output_files; + my $component = 0; + foreach my $bam_file ( $self->analysis->list_all_bam_files() ) { + foreach my $get_chunk ( @{$chunks} ) { + $component++; + if ( refaddr($merge_chunk) == refaddr($get_chunk) ) { + my $output_file = + $self->get_and_check_output_file( 'get_read_peaks', + $component ); + push @get_read_peaks_output_files, $output_file; + } + } + } + push @all_parameters, [ $merge_chunk, @get_read_peaks_output_files ]; + } + + return @all_parameters; +} + +=method run_merge_read_peaks + + Usage : run_merge_read_peaks(); + Purpose : Run function for merge_read_peaks stage + Returns : undef + Parameters : DETCT::Pipeline::Job + Throws : No exceptions + Comments : None + +=cut + +sub run_merge_read_peaks { + my ( $self, $job ) = @_; + + my ( $chunk, @get_read_peaks_output_files ) = @{ $job->parameters }; + + # Join lists of peaks + my %unmerged_peaks; + foreach my $output_file (@get_read_peaks_output_files) { + %unmerged_peaks = %{ + $self->hash_merge->merge( + \%unmerged_peaks, LoadFile($output_file) + ) + }; + } + + my %chunk_peaks; + + # Merge read peaks for each sequence of a chunk separately + foreach my $seq ( @{$chunk} ) { + my $seq_peaks = merge_read_peaks( + { + peak_buffer_width => $self->analysis->peak_buffer_width, + seq_name => $seq->name, + peaks => $unmerged_peaks{ $seq->name }, + } + ); + %chunk_peaks = + %{ $self->hash_merge->merge( \%chunk_peaks, $seq_peaks ) }; + } + + my $output_file = $job->base_filename . 
'.out'; + + DumpFile( $output_file, \%chunk_peaks ); + + return; +} + +=method all_parameters_for_summarise_read_peaks + + Usage : all_parameters_for_summarise_read_peaks(); + Purpose : Get all parameters for summarise_read_peaks stage + Returns : Array of arrayrefs + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub all_parameters_for_summarise_read_peaks { + my ($self) = @_; + + my @all_parameters; + + my $chunks = $self->analysis->get_all_chunks(); + + my $component = 0; + foreach my $chunk ( @{$chunks} ) { + $component++; + my $merge_read_peaks_output_file = + $self->get_and_check_output_file( 'merge_read_peaks', $component ); + push @all_parameters, [ $chunk, $merge_read_peaks_output_file ]; + } + + return @all_parameters; +} + +=method run_summarise_read_peaks + + Usage : run_summarise_read_peaks(); + Purpose : Run function for summarise_read_peaks stage + Returns : undef + Parameters : DETCT::Pipeline::Job + Throws : No exceptions + Comments : None + +=cut + +sub run_summarise_read_peaks { + my ( $self, $job ) = @_; + + my ( $chunk, $merge_read_peaks_output_file ) = @{ $job->parameters }; + + # Get merged peaks + my %merged_peaks = %{ LoadFile($merge_read_peaks_output_file) }; + + my %chunk_summary; + + # Summarise read peaks for each sequence of a chunk separately + foreach my $seq ( @{$chunk} ) { + my $seq_summary = summarise_read_peaks( + { + bin_size => $self->analysis->bin_size, + peak_buffer_width => $self->analysis->peak_buffer_width, + hmm_sig_level => $self->analysis->hmm_sig_level, + seq_name => $seq->name, + seq_bp => $seq->bp, + read_length => $self->analysis->read2_length, + peaks => $merged_peaks{ $seq->name }, + } + ); + %chunk_summary = + %{ $self->hash_merge->merge( \%chunk_summary, $seq_summary ) }; + } + + my $output_file = $job->base_filename . 
'.out'; + + DumpFile( $output_file, \%chunk_summary ); + + return; +} + +=method all_parameters_for_run_peak_hmm + + Usage : all_parameters_for_run_peak_hmm(); + Purpose : Get all parameters for run_peak_hmm stage + Returns : Array of arrayrefs + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub all_parameters_for_run_peak_hmm { + my ($self) = @_; + + my @all_parameters; + + my $chunks = $self->analysis->get_all_chunks(); + + # Work out which bin_reads stage files need to be combined + my $component = 0; + foreach my $hmm_chunk ( @{$chunks} ) { + $component++; + my @bin_reads_output_files; + my $bin_component = 0; + foreach my $bam_file ( $self->analysis->list_all_bam_files() ) { + foreach my $bin_chunk ( @{$chunks} ) { + $bin_component++; + if ( refaddr($hmm_chunk) == refaddr($bin_chunk) ) { + my $bin_output_file = + $self->get_and_check_output_file( 'bin_reads', + $bin_component ); + push @bin_reads_output_files, $bin_output_file; + } + } + } + my $summary_output_file = + $self->get_and_check_output_file( 'summarise_read_peaks', + $component ); + push @all_parameters, + [ $hmm_chunk, $summary_output_file, @bin_reads_output_files ]; + } + + return @all_parameters; +} + +=method run_run_peak_hmm + + Usage : run_run_peak_hmm(); + Purpose : Run function for run_peak_hmm stage + Returns : undef + Parameters : DETCT::Pipeline::Job + Throws : No exceptions + Comments : None + +=cut + +sub run_run_peak_hmm { + my ( $self, $job ) = @_; + + my ( $chunk, $summary_output_file, @bin_reads_output_files ) = + @{ $job->parameters }; + + # Join read bins + my %read_bins; + foreach my $output_file (@bin_reads_output_files) { + %read_bins = + %{ $self->hash_merge->merge( \%read_bins, LoadFile($output_file) ) }; + } + + # Load summary + my $summary = LoadFile($summary_output_file); + + my %chunk_hmm; + + # Run peak HMM for each sequence of a chunk separately + foreach my $seq ( @{$chunk} ) { + my $seq_hmm = run_peak_hmm( + { + dir => $job->base_filename, + hmm_sig_level => $self->analysis->hmm_sig_level, + seq_name => $seq->name, + read_bins => $read_bins{ $seq->name }, + summary => $summary->{ $seq->name }, + hmm_binary => $self->analysis->hmm_binary, + } + ); + %chunk_hmm = %{ $self->hash_merge->merge( \%chunk_hmm, $seq_hmm ) }; + } + + my $output_file = $job->base_filename . 
'.out'; + + DumpFile( $output_file, \%chunk_hmm ); + + return; +} + +=method all_parameters_for_join_hmm_bins + + Usage : all_parameters_for_join_hmm_bins(); + Purpose : Get all parameters for join_hmm_bins stage + Returns : Array of arrayrefs + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub all_parameters_for_join_hmm_bins { + my ($self) = @_; + + my @all_parameters; + + my $chunks = $self->analysis->get_all_chunks(); + + my $component = 0; + foreach my $chunk ( @{$chunks} ) { + $component++; + my $run_peak_hmm_output_file = + $self->get_and_check_output_file( 'run_peak_hmm', $component ); + push @all_parameters, [ $chunk, $run_peak_hmm_output_file ]; + } + + return @all_parameters; +} + +=method run_join_hmm_bins + + Usage : run_join_hmm_bins(); + Purpose : Run function for join_hmm_bins stage + Returns : undef + Parameters : DETCT::Pipeline::Job + Throws : No exceptions + Comments : None + +=cut + +sub run_join_hmm_bins { + my ( $self, $job ) = @_; + + my ( $chunk, $run_peak_hmm_output_file ) = @{ $job->parameters }; + + # Get HMM bins + my $hmm_bins = LoadFile($run_peak_hmm_output_file); + + my %chunk_regions; + + # Join HMM bins for each sequence of a chunk separately + foreach my $seq ( @{$chunk} ) { + my $seq_regions = join_hmm_bins( + { + bin_size => $self->analysis->bin_size, + seq_name => $seq->name, + hmm_bins => $hmm_bins->{ $seq->name }, + } + ); + %chunk_regions = + %{ $self->hash_merge->merge( \%chunk_regions, $seq_regions ) }; + } + + my $output_file = $job->base_filename . '.out'; + + DumpFile( $output_file, \%chunk_regions ); + + return; +} + +=method all_parameters_for_get_three_prime_ends + + Usage : all_parameters_for_get_three_prime_ends(); + Purpose : Get all parameters for get_three_prime_ends stage + Returns : Array of arrayrefs + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub all_parameters_for_get_three_prime_ends { + my ($self) = @_; + + my @all_parameters; + + my $chunks = $self->analysis->get_all_chunks(); + + foreach my $bam_file ( $self->analysis->list_all_bam_files() ) { + my @tags = $self->analysis->list_all_tags_by_bam_file($bam_file); + my $component = 0; + foreach my $chunk ( @{$chunks} ) { + $component++; + my $join_hmm_bins_output_file = + $self->get_and_check_output_file( 'join_hmm_bins', $component ); + push @all_parameters, + [ $chunk, $bam_file, $join_hmm_bins_output_file, @tags ]; + } + } + + return @all_parameters; +} + +=method run_get_three_prime_ends + + Usage : run_get_three_prime_ends(); + Purpose : Run function for get_three_prime_ends stage + Returns : undef + Parameters : DETCT::Pipeline::Job + Throws : No exceptions + Comments : None + +=cut + +sub run_get_three_prime_ends { + my ( $self, $job ) = @_; + + my ( $chunk, $bam_file, $join_hmm_bins_output_file, @tags ) = + @{ $job->parameters }; + + # Get regions + my $regions = LoadFile($join_hmm_bins_output_file); + + my %chunk_regions; + + # Get 3' ends for each sequence of a chunk separately + foreach my $seq ( @{$chunk} ) { + my $seq_regions = get_three_prime_ends( + { + bam_file => $bam_file, + mismatch_threshold => $self->analysis->mismatch_threshold, + seq_name => $seq->name, + tags => \@tags, + regions => $regions->{ $seq->name }, + } + ); + %chunk_regions = + %{ $self->hash_merge->merge( \%chunk_regions, $seq_regions ) }; + } + + my $output_file = $job->base_filename . 
'.out'; + + DumpFile( $output_file, \%chunk_regions ); + + return; +} + +=method all_parameters_for_merge_three_prime_ends + + Usage : all_parameters_for_merge_three_prime_ends(); + Purpose : Get all parameters for merge_three_prime_ends stage + Returns : Array of arrayrefs + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub all_parameters_for_merge_three_prime_ends { + my ($self) = @_; + + my @all_parameters; + + my $chunks = $self->analysis->get_all_chunks(); + + # Work out which get_three_prime_ends stage files need to be merged + foreach my $merge_chunk ( @{$chunks} ) { + my @get_three_prime_ends_output_files; + my $component = 0; + foreach my $bam_file ( $self->analysis->list_all_bam_files() ) { + foreach my $run_chunk ( @{$chunks} ) { + $component++; + if ( refaddr($merge_chunk) == refaddr($run_chunk) ) { + my $output_file = + $self->get_and_check_output_file( 'get_three_prime_ends', + $component ); + push @get_three_prime_ends_output_files, $output_file; + } + } + } + push @all_parameters, + [ $merge_chunk, @get_three_prime_ends_output_files ]; + } + + return @all_parameters; +} + +=method run_merge_three_prime_ends + + Usage : run_merge_three_prime_ends(); + Purpose : Run function for merge_three_prime_ends stage + Returns : undef + Parameters : DETCT::Pipeline::Job + Throws : No exceptions + Comments : None + +=cut + +sub run_merge_three_prime_ends { + my ( $self, $job ) = @_; + + my ( $chunk, @get_three_prime_ends_output_files ) = @{ $job->parameters }; + + # Load all regions + my @list_of_lists_of_regions; + foreach my $output_file (@get_three_prime_ends_output_files) { + my $regions = LoadFile($output_file); + push @list_of_lists_of_regions, $regions; + } + + my %chunk_regions; + + # Merge 3' ends for each sequence of a chunk separately + foreach my $seq ( @{$chunk} ) { + my @regions = map { $_->{ $seq->name } } @list_of_lists_of_regions; + my $seq_regions = merge_three_prime_ends( + { + seq_name => $seq->name, + regions => \@regions, + } + ); + %chunk_regions = + %{ $self->hash_merge->merge( \%chunk_regions, $seq_regions ) }; + } + + my $output_file = $job->base_filename . 
'.out'; + + DumpFile( $output_file, \%chunk_regions ); + + return; +} + +=method all_parameters_for_filter_three_prime_ends + + Usage : all_parameters_for_filter_three_prime_ends(); + Purpose : Get all parameters for filter_three_prime_ends stage + Returns : Array of arrayrefs + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub all_parameters_for_filter_three_prime_ends { + my ($self) = @_; + + my @all_parameters; + + my $chunks = $self->analysis->get_all_chunks(); + + my $component = 0; + foreach my $chunk ( @{$chunks} ) { + $component++; + my $merge_three_prime_ends_output_file = + $self->get_and_check_output_file( 'merge_three_prime_ends', + $component ); + push @all_parameters, [ $chunk, $merge_three_prime_ends_output_file ]; + } + + return @all_parameters; +} + +=method run_filter_three_prime_ends + + Usage : run_filter_three_prime_ends(); + Purpose : Run function for filter_three_prime_ends stage + Returns : undef + Parameters : DETCT::Pipeline::Job + Throws : No exceptions + Comments : None + +=cut + +sub run_filter_three_prime_ends { + my ( $self, $job ) = @_; + + my ( $chunk, $merge_three_prime_ends_output_file ) = @{ $job->parameters }; + + # Get regions + my $regions = LoadFile($merge_three_prime_ends_output_file); + + my %chunk_regions; + + # Filter 3' ends for each sequence of a chunk separately + foreach my $seq ( @{$chunk} ) { + my $seq_regions = filter_three_prime_ends( + { + analysis => $self->analysis, + seq_name => $seq->name, + regions => $regions->{ $seq->name }, + } + ); + %chunk_regions = + %{ $self->hash_merge->merge( \%chunk_regions, $seq_regions ) }; + } + + my $output_file = $job->base_filename . '.out'; + + DumpFile( $output_file, \%chunk_regions ); + + return; +} + +=method all_parameters_for_choose_three_prime_end + + Usage : all_parameters_for_choose_three_prime_end(); + Purpose : Get all parameters for choose_three_prime_end stage + Returns : Array of arrayrefs + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub all_parameters_for_choose_three_prime_end { + my ($self) = @_; + + my @all_parameters; + + my $chunks = $self->analysis->get_all_chunks(); + + my $component = 0; + foreach my $chunk ( @{$chunks} ) { + $component++; + my $filter_three_prime_ends_output_file = + $self->get_and_check_output_file( 'filter_three_prime_ends', + $component ); + push @all_parameters, [ $chunk, $filter_three_prime_ends_output_file ]; + } + + return @all_parameters; +} + +=method run_choose_three_prime_end + + Usage : run_choose_three_prime_end(); + Purpose : Run function for choose_three_prime_end stage + Returns : undef + Parameters : DETCT::Pipeline::Job + Throws : No exceptions + Comments : None + +=cut + +sub run_choose_three_prime_end { + my ( $self, $job ) = @_; + + my ( $chunk, $filter_three_prime_ends_output_file ) = @{ $job->parameters }; + + # Get regions + my $regions = LoadFile($filter_three_prime_ends_output_file); + + my %chunk_regions; + + # Choose 3' ends for each sequence of a chunk separately + foreach my $seq ( @{$chunk} ) { + my $seq_regions = choose_three_prime_end( + { + seq_name => $seq->name, + regions => $regions->{ $seq->name }, + } + ); + %chunk_regions = + %{ $self->hash_merge->merge( \%chunk_regions, $seq_regions ) }; + } + + my $output_file = $job->base_filename . 
'.out'; + + DumpFile( $output_file, \%chunk_regions ); + + return; +} + +=method all_parameters_for_count_reads + + Usage : all_parameters_for_count_reads(); + Purpose : Get all parameters for count_reads stage + Returns : Array of arrayrefs + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub all_parameters_for_count_reads { + my ($self) = @_; + + my @all_parameters; + + my $chunks = $self->analysis->get_all_chunks(); + + foreach my $bam_file ( $self->analysis->list_all_bam_files() ) { + my @tags = $self->analysis->list_all_tags_by_bam_file($bam_file); + my $component = 0; + foreach my $chunk ( @{$chunks} ) { + $component++; + my $choose_three_prime_end_output_file = + $self->get_and_check_output_file( 'choose_three_prime_end', + $component ); + push @all_parameters, + [ $chunk, $bam_file, $choose_three_prime_end_output_file, @tags ]; + } + } + + return @all_parameters; +} + +=method run_count_reads + + Usage : run_count_reads(); + Purpose : Run function for count_reads stage + Returns : undef + Parameters : DETCT::Pipeline::Job + Throws : No exceptions + Comments : None + +=cut + +sub run_count_reads { + my ( $self, $job ) = @_; + + my ( $chunk, $bam_file, $choose_three_prime_end_output_file, @tags ) = + @{ $job->parameters }; + + # Get regions + my $regions = LoadFile($choose_three_prime_end_output_file); + + my %chunk_regions; + + # Count reads for each sequence of a chunk separately + foreach my $seq ( @{$chunk} ) { + my $seq_regions = count_reads( + { + bam_file => $bam_file, + mismatch_threshold => $self->analysis->mismatch_threshold, + seq_name => $seq->name, + regions => $regions->{ $seq->name }, + tags => \@tags, + } + ); + %chunk_regions = + %{ $self->hash_merge->merge( \%chunk_regions, $seq_regions ) }; + } + + my $output_file = $job->base_filename . 
'.out'; + + DumpFile( $output_file, \%chunk_regions ); + + return; +} + +=method all_parameters_for_merge_read_counts + + Usage : all_parameters_for_merge_read_counts(); + Purpose : Get all parameters for merge_read_counts stage + Returns : Array of arrayrefs + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub all_parameters_for_merge_read_counts { + my ($self) = @_; + + my @all_parameters; + + my $chunks = $self->analysis->get_all_chunks(); + + # Work out which count_reads stage files need to be merged + foreach my $merge_chunk ( @{$chunks} ) { + my %output_file_for; + my $component = 0; + foreach my $bam_file ( $self->analysis->list_all_bam_files() ) { + foreach my $run_chunk ( @{$chunks} ) { + $component++; + if ( refaddr($merge_chunk) == refaddr($run_chunk) ) { + my $output_file = + $self->get_and_check_output_file( 'count_reads', + $component ); + $output_file_for{$bam_file} = $output_file; + } + } + } + push @all_parameters, [ $merge_chunk, %output_file_for ]; + } + + return @all_parameters; +} + +=method run_merge_read_counts + + Usage : run_merge_read_counts(); + Purpose : Run function for merge_read_counts stage + Returns : undef + Parameters : DETCT::Pipeline::Job + Throws : No exceptions + Comments : None + +=cut + +sub run_merge_read_counts { + my ( $self, $job ) = @_; + + my ( $chunk, %output_file_for ) = @{ $job->parameters }; + + # Load all regions + my %hash_of_lists_of_regions; + foreach my $bam_file ( keys %output_file_for ) { + my $regions = LoadFile( $output_file_for{$bam_file} ); + $hash_of_lists_of_regions{$bam_file} = $regions; + } + + my %chunk_regions; + + # Merge read counts for each sequence of a chunk separately + foreach my $seq ( @{$chunk} ) { + + # Hash keyed by BAM file + my %regions = + map { $_ => $hash_of_lists_of_regions{$_}->{ $seq->name } } + keys %hash_of_lists_of_regions; + my $seq_regions = merge_read_counts( + { + seq_name => $seq->name, + regions => \%regions, + samples => $self->analysis->get_all_samples(), + } + ); + %chunk_regions = + %{ $self->hash_merge->merge( \%chunk_regions, $seq_regions ) }; + } + + my $output_file = $job->base_filename . 
'.out'; + + DumpFile( $output_file, \%chunk_regions ); + + return; +} + +=method all_parameters_for_run_deseq + + Usage : all_parameters_for_run_deseq(); + Purpose : Get all parameters for run_deseq stage + Returns : Arrayref + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub all_parameters_for_run_deseq { + my ($self) = @_; + + my @all_parameters; + + my $chunks = $self->analysis->get_all_chunks(); + + my @merge_read_counts_output_files; + my $component = 0; + foreach my $chunk ( @{$chunks} ) { + $component++; + push @merge_read_counts_output_files, + $self->get_and_check_output_file( 'merge_read_counts', $component ); + } + push @all_parameters, \@merge_read_counts_output_files; + + return @all_parameters; +} + +=method run_run_deseq + + Usage : run_run_deseq(); + Purpose : Run function for run_deseq stage + Returns : undef + Parameters : DETCT::Pipeline::Job + Throws : No exceptions + Comments : None + +=cut + +sub run_run_deseq { + my ( $self, $job ) = @_; + + my (@merge_read_counts_output_files) = @{ $job->parameters }; + + # Join regions + my %regions; + foreach my $output_file (@merge_read_counts_output_files) { + %regions = + %{ $self->hash_merge->merge( \%regions, LoadFile($output_file) ) }; + } + + my $regions_ref = run_deseq( + { + dir => $job->base_filename, + regions => \%regions, + samples => $self->analysis->get_all_samples(), + r_binary => $self->analysis->r_binary, + deseq_script => $self->analysis->deseq_script, + } + ); + + my $output_file = $job->base_filename . '.out'; + + DumpFile( $output_file, $regions_ref ); + + return; +} + +=method all_parameters_for_add_gene_annotation + + Usage : all_parameters_for_add_gene_annotation(); + Purpose : Get all parameters for add_gene_annotation stage + Returns : Arrayref + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub all_parameters_for_add_gene_annotation { + my ($self) = @_; + + my @all_parameters; + + my $run_deseq_output_file = + $self->get_and_check_output_file( 'run_deseq', 1 ); + + push @all_parameters, [$run_deseq_output_file]; + + return @all_parameters; +} + +=method run_add_gene_annotation + + Usage : run_add_gene_annotation(); + Purpose : Run function for add_gene_annotation stage + Returns : undef + Parameters : DETCT::Pipeline::Job + Throws : No exceptions + Comments : None + +=cut + +sub run_add_gene_annotation { + my ( $self, $job ) = @_; + + my ($run_deseq_output_file) = @{ $job->parameters }; + + # Get regions + my $regions = LoadFile($run_deseq_output_file); + + # Annotate 3' ends with genes + # Could split regions by chunk if slow + my $gene_finder = DETCT::GeneFinder->new( + { slice_adaptor => $self->analysis->slice_adaptor, } ); + my $annotated_regions_ref = $gene_finder->add_gene_annotation($regions); + + my $output_file = $job->base_filename . 
'.out'; + + DumpFile( $output_file, $annotated_regions_ref ); + + return; +} + +=method all_parameters_for_dump_as_table + + Usage : all_parameters_for_dump_as_table(); + Purpose : Get all parameters for dump_as_table stage + Returns : Arrayref + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub all_parameters_for_dump_as_table { + my ($self) = @_; + + my @all_parameters; + + my $add_gene_annotation_output_file = + $self->get_and_check_output_file( 'add_gene_annotation', 1 ); + + push @all_parameters, [$add_gene_annotation_output_file]; + + return @all_parameters; +} + +=method run_dump_as_table + + Usage : run_dump_as_table(); + Purpose : Run function for dump_as_table stage + Returns : undef + Parameters : DETCT::Pipeline::Job + Throws : No exceptions + Comments : None + +=cut + +sub run_dump_as_table { + my ( $self, $job ) = @_; + + my ($add_gene_annotation_output_file) = @{ $job->parameters }; + + # Get regions + my $regions = LoadFile($add_gene_annotation_output_file); + + DETCT::Misc::Output::dump_as_table( + { + analysis => $self->analysis, + dir => $job->base_filename, + regions => $regions, + } + ); + + my $output_file = $job->base_filename . '.out'; + + DumpFile( $output_file, 1 ); + + return; +} + +1; diff --git a/lib/DETCT/Sample.pm b/lib/DETCT/Sample.pm new file mode 100644 index 0000000..042a235 --- /dev/null +++ b/lib/DETCT/Sample.pm @@ -0,0 +1,367 @@ +## no critic (RequireUseStrict, RequireUseWarnings, RequireTidyCode) +package DETCT::Sample; +## use critic + +# ABSTRACT: Object representing a sample + +## Author : is1 +## Maintainer : is1 +## Created : 2012-09-19 +## Last commit by : $Author$ +## Last modified : $Date$ +## Revision : $Revision$ +## Repository URL : $HeadURL$ + +use warnings; +use strict; +use autodie; +use Carp; +use Try::Tiny; + +use Readonly; +use Class::InsideOut qw( private register id ); + +=head1 SYNOPSIS + + # Brief code examples + +=cut + +# Attributes: +private name => my %name; # e.g. zmp_ph1_1m +private description => my %description; # e.g. ZMP phenotype 1.1 mutant +private condition => my %condition; # e.g. mutant +private group => my %group; # e.g. 1 +private tag => my %tag; # e.g. NNNNBGAGGC +private bam_file => my %bam_file; # e.g. 8295_6#1.bam + +# Constants +Readonly our $MAX_NAME_LENGTH => 128; +Readonly our $MAX_CONDITION_LENGTH => 128; +Readonly our $MAX_GROUP_LENGTH => 128; + +=method new + + Usage : my $sample = DETCT::Sample->new( { + name => 'zmp_ph1_1m', + condition => 'mutant', + group => '1', + tag => 'NNNNBGAGGC', + bam_file => '8295_6#1.bam', + } ); + Purpose : Constructor for sample objects + Returns : DETCT::Sample + Parameters : Hashref { + name => String, + description => String or undef, + condition => String, + group => String or undef, + tag => String, + bam_file => String, + } + Throws : No exceptions + Comments : None + +=cut + +sub new { + my ( $class, $arg_ref ) = @_; + my $self = register($class); + $self->set_name( $arg_ref->{name} ); + $self->set_description( $arg_ref->{description} ); + $self->set_condition( $arg_ref->{condition} ); + $self->set_group( $arg_ref->{group} ); + $self->set_tag( $arg_ref->{tag} ); + $self->set_bam_file( $arg_ref->{bam_file} ); + return $self; +} + +=method name + + Usage : my $name = $sample->name; + Purpose : Getter for name attribute + Returns : String (e.g. 
"zmp_ph1_1m") + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub name { + my ($self) = @_; + return $name{ id $self}; +} + +=method set_name + + Usage : $sample->set_name('zmp_ph1_1m'); + Purpose : Setter for name attribute + Returns : undef + Parameters : String (the name) + Throws : No exceptions + Comments : None + +=cut + +sub set_name { + my ( $self, $arg ) = @_; + $name{ id $self} = _check_name($arg); + return; +} + +# Usage : $name = _check_name($name); +# Purpose : Check for valid name +# Returns : String (the valid name) +# Parameters : String (the name) +# Throws : If name is missing +# If name is invalid (i.e. not alphanumeric) +# If name is empty +# If name > $MAX_NAME_LENGTH characters +# Comments : None +sub _check_name { + my ($name) = @_; + + confess 'No name specified' if !defined $name; + confess 'Empty name specified' if !length $name; + confess 'Invalid name specified' if $name !~ m/\A [\w.-]+ \z/xms; + confess "Name ($name) longer than $MAX_NAME_LENGTH characters" + if length $name > $MAX_NAME_LENGTH; + + return $name; +} + +=method description + + Usage : my $description = $sample->description; + Purpose : Getter for description attribute + Returns : String (e.g. "ZMP phenotype 1.1 mutant") + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub description { + my ($self) = @_; + return $description{ id $self}; +} + +=method set_description + + Usage : $sample->set_description('ZMP phenotype 1.1 mutant'); + Purpose : Setter for description attribute + Returns : undef + Parameters : String (the description) + Throws : No exceptions + Comments : None + +=cut + +sub set_description { + my ( $self, $arg ) = @_; + $description{ id $self} = $arg; + return; +} + +=method condition + + Usage : my $condition = $sample->condition; + Purpose : Getter for condition attribute + Returns : String (e.g. "mutant") + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub condition { + my ($self) = @_; + return $condition{ id $self}; +} + +=method set_condition + + Usage : $sample->set_condition('mutant'); + Purpose : Setter for condition attribute + Returns : undef + Parameters : String (the condition) + Throws : No exceptions + Comments : None + +=cut + +sub set_condition { + my ( $self, $arg ) = @_; + $condition{ id $self} = _check_condition($arg); + return; +} + +# Usage : $condition = _check_condition($condition); +# Purpose : Check for valid condition +# Returns : String (the valid condition) +# Parameters : String (the condition) +# Throws : If condition is missing +# If condition is empty +# If condition > $MAX_GROUP_LENGTH characters +# Comments : None +sub _check_condition { + my ($condition) = @_; + + confess 'No condition specified' if !defined $condition; + confess 'Empty condition specified' if !length $condition; + confess + "Condition ($condition) longer than $MAX_CONDITION_LENGTH characters" + if length $condition > $MAX_CONDITION_LENGTH; + + return $condition; +} + +=method group + + Usage : my $group = $sample->group; + Purpose : Getter for group attribute + Returns : String (e.g. 
"1") + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub group { + my ($self) = @_; + return $group{ id $self}; +} + +=method set_group + + Usage : $sample->set_group('1'); + Purpose : Setter for group attribute + Returns : undef + Parameters : String (the group) + Throws : No exceptions + Comments : None + +=cut + +sub set_group { + my ( $self, $arg ) = @_; + $group{ id $self} = _check_group($arg); + return; +} + +# Usage : $group = _check_group($group); +# Purpose : Check for valid group +# Returns : String (the valid group) +# Parameters : String (the group) +# Throws : If group is empty +# If group > $MAX_GROUP_LENGTH characters +# Comments : None +sub _check_group { + my ($group) = @_; + + confess 'Empty group specified' if defined $group && !length $group; + confess "Group ($group) longer than $MAX_GROUP_LENGTH characters" + if defined $group && length $group > $MAX_GROUP_LENGTH; + + return $group; +} + +=method tag + + Usage : my $tag = $sample->tag; + Purpose : Getter for tag attribute + Returns : String (e.g. "NNNNBGAGGC") + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub tag { + my ($self) = @_; + return $tag{ id $self}; +} + +=method set_tag + + Usage : $sample->set_tag('NNNNBGAGGC'); + Purpose : Setter for tag attribute + Returns : undef + Parameters : String (the tag) + Throws : No exceptions + Comments : None + +=cut + +sub set_tag { + my ( $self, $arg ) = @_; + $tag{ id $self} = _check_tag($arg); + return; +} + +# Usage : $tag = _check_tag($tag); +# Purpose : Check for valid tag +# Returns : String (the valid tag) +# Parameters : String (the tag) +# Throws : If tag is missing or invalid +# Comments : None +sub _check_tag { + my ($tag) = @_; + return $tag + if defined $tag && $tag =~ m/\A [NRYKMSWBDHV]+ [AGCT]+ \z/xms; + confess 'No tag specified' if !defined $tag; + confess "Invalid tag ($tag) specified"; +} + +=method bam_file + + Usage : my $bam_file = $sample->bam_file; + Purpose : Getter for BAM file attribute + Returns : String (e.g. 
"8295_6#1.bam") + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub bam_file { + my ($self) = @_; + return $bam_file{ id $self}; +} + +=method set_bam_file + + Usage : $sample->set_bam('8295_6#1.bam'); + Purpose : Setter for BAM file attribute + Returns : undef + Parameters : String (the BAM file) + Throws : No exceptions + Comments : None + +=cut + +sub set_bam_file { + my ( $self, $arg ) = @_; + $bam_file{ id $self} = check_bam_file($arg); + return; +} + +=method check_bam_file + + Usage : $bam_file = check_bam_file($bam_file); + Purpose : Check for valid BAM file + Returns : String (the valid BAM file) + Parameters : String (the BAM file) + Throws : If BAM file is missing or not readable + Comments : None + +=cut + +sub check_bam_file { + my ($bam_file) = @_; + return $bam_file if defined $bam_file && -r $bam_file; + confess 'No BAM file specified' if !defined $bam_file; + confess "BAM file ($bam_file) does not exist or cannot be read"; +} + +1; diff --git a/lib/DETCT/Sequence.pm b/lib/DETCT/Sequence.pm new file mode 100644 index 0000000..ff80064 --- /dev/null +++ b/lib/DETCT/Sequence.pm @@ -0,0 +1,161 @@ +## no critic (RequireUseStrict, RequireUseWarnings, RequireTidyCode) +package DETCT::Sequence; +## use critic + +# ABSTRACT: Object representing a sequence (a component of a reference sequence) + +## Author : is1 +## Maintainer : is1 +## Created : 2012-09-21 +## Last commit by : $Author$ +## Last modified : $Date$ +## Revision : $Revision$ +## Repository URL : $HeadURL$ + +use warnings; +use strict; +use autodie; +use Carp; +use Try::Tiny; + +use Readonly; +use Class::InsideOut qw( private register id ); + +=head1 SYNOPSIS + + # Brief code examples + +=cut + +# Attributes: +private name => my %name; # e.g. 1 +private bp => my %bp; # e.g. 60348388 + +# Constants +Readonly our $MAX_NAME_LENGTH => 128; + +=method new + + Usage : my $sequence = DETCT::Sequence->new( { + name => '1', + bp => 60_348_388, + } ); + Purpose : Constructor for sequence objects + Returns : DETCT::Sequence + Parameters : Hashref { + name => String, + bp => Int, + } + Throws : No exceptions + Comments : None + +=cut + +sub new { + my ( $class, $arg_ref ) = @_; + my $self = register($class); + $self->set_name( $arg_ref->{name} ); + $self->set_bp( $arg_ref->{bp} ); + return $self; +} + +=method name + + Usage : my $name = $sequence->name; + Purpose : Getter for name attribute + Returns : String (e.g. 
"1") + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub name { + my ($self) = @_; + return $name{ id $self}; +} + +=method set_name + + Usage : $sequence->set_name('1'); + Purpose : Setter for name attribute + Returns : undef + Parameters : String (the name) + Throws : No exceptions + Comments : None + +=cut + +sub set_name { + my ( $self, $arg ) = @_; + $name{ id $self} = _check_name($arg); + return; +} + +# Usage : $name = _check_name($name); +# Purpose : Check for valid name +# Returns : String (the valid name) +# Parameters : String (the name) +# Throws : If name is missing +# If name is empty +# If name > $MAX_NAME_LENGTH characters +# Comments : None +sub _check_name { + my ($name) = @_; + + confess 'No name specified' if !defined $name; + confess 'Empty name specified' if !length $name; + confess "Name ($name) longer than $MAX_NAME_LENGTH characters" + if length $name > $MAX_NAME_LENGTH; + + return $name; +} + +=method bp + + Usage : my $bp = $sequence->bp; + Purpose : Getter for bp attribute + Returns : +ve Int + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub bp { + my ($self) = @_; + return $bp{ id $self}; +} + +=method set_bp + + Usage : $sequence->set_bp(40352744); + Purpose : Setter for bp attribute + Returns : undef + Parameters : +ve Int (bp) + Throws : No exceptions + Comments : None + +=cut + +sub set_bp { + my ( $self, $arg ) = @_; + $bp{ id $self} = _check_bp($arg); + return; +} + +# Usage : $bp = _check_bp($bp); +# Purpose : Check for valid bp +# Returns : +ve Int (valid bp) +# Parameters : +ve Int (bp) +# Throws : If bp is missing or not a positive integer +# Comments : None +sub _check_bp { + my ($bp) = @_; + return $bp + if defined $bp && $bp =~ m/\A \d+ \z/xms; + confess 'No bp specified' if !defined $bp; + confess "Invalid bp ($bp) specified"; +} + +1; diff --git a/lib/DETCT/Transcript.pm b/lib/DETCT/Transcript.pm new file mode 100644 index 0000000..b7a406e --- /dev/null +++ b/lib/DETCT/Transcript.pm @@ -0,0 +1,516 @@ +## no critic (RequireUseStrict, RequireUseWarnings, RequireTidyCode) +package DETCT::Transcript; +## use critic + +# ABSTRACT: Object representing a transcript + +## Author : is1 +## Maintainer : is1 +## Created : 2013-01-28 +## Last commit by : $Author$ +## Last modified : $Date$ +## Revision : $Revision$ +## Repository URL : $HeadURL$ + +use warnings; +use strict; +use autodie; +use Carp; +use Try::Tiny; + +use Readonly; +use Class::InsideOut qw( private register id ); + +=head1 SYNOPSIS + + # Brief code examples + +=cut + +# Attributes: +private stable_id => my %stable_id; # e.g. ENSDART00000133571 +private name => my %name; # e.g. cxc64-001 +private description => my %description; # e.g. CXC chemokine 64... +private biotype => my %biotype; # e.g. protein_coding +private seq_name => my %seq_name; # e.g. 5 +private start => my %start; # e.g. 40352744 +private end => my %end; # e.g. 40354399 +private strand => my %strand; # e.g. 
1 +private gene => my %gene; # DETCT::Gene + +# Constants +Readonly our $MAX_NAME_LENGTH => 128; + +=method new + + Usage : my $transcript = DETCT::Transcript->new( { + stable_id => 'ENSDART00000133571', + biotype => 'protein_coding', + seq_name => '5', + start => 40352744, + end => 40354399, + strand => 1, + } ); + Purpose : Constructor for transcript objects + Returns : DETCT::Transcript + Parameters : Hashref { + stable_id => String, + name => String or undef, + description => String or undef, + biotype => String, + seq_name => String, + start => +ve Int, + end => +ve Int, + strand => Int (1 or -1), + gene => DETCT::Gene, + } + Throws : No exceptions + Comments : None + +=cut + +sub new { + my ( $class, $arg_ref ) = @_; + my $self = register($class); + $self->set_stable_id( $arg_ref->{stable_id} ); + $self->set_name( $arg_ref->{name} ); + $self->set_description( $arg_ref->{description} ); + $self->set_biotype( $arg_ref->{biotype} ); + $self->set_seq_name( $arg_ref->{seq_name} ); + $self->set_start( $arg_ref->{start} ); + $self->set_end( $arg_ref->{end} ); + $self->set_strand( $arg_ref->{strand} ); + $self->set_gene( $arg_ref->{gene} ); + return $self; +} + +=method stable_id + + Usage : my $stable_id = $transcript->stable_id; + Purpose : Getter for stable id attribute + Returns : String (e.g. "ENSDART00000133571") + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub stable_id { + my ($self) = @_; + return $stable_id{ id $self}; +} + +=method set_stable_id + + Usage : $transcript->set_stable_id('ENSDART00000133571'); + Purpose : Setter for stable id attribute + Returns : undef + Parameters : String (the stable id) + Throws : No exceptions + Comments : None + +=cut + +sub set_stable_id { + my ( $self, $arg ) = @_; + $stable_id{ id $self} = check_stable_id($arg); + return; +} + +=method check_stable_id + + Usage : $stable_id = check_stable_id($stable_id); + Purpose : Check for valid stable id + Returns : String (the valid stable id) + Parameters : String (the stable id) + Throws : If stable id is missing or invalid + Comments : None + +=cut + +sub check_stable_id { + my ($stable_id) = @_; + return $stable_id + if defined $stable_id && $stable_id =~ m/\A [[:upper:]]+ \d{11} \z/xms; + confess 'No stable id specified' if !defined $stable_id; + confess "Invalid stable id ($stable_id) specified"; +} + +=method name + + Usage : my $name = $transcript->name; + Purpose : Getter for name attribute + Returns : String (e.g. "cxc64-001") + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub name { + my ($self) = @_; + return $name{ id $self}; +} + +=method set_name + + Usage : $transcript->set_name('cxc64-001'); + Purpose : Setter for name attribute + Returns : undef + Parameters : String (the name) + Throws : No exceptions + Comments : None + +=cut + +sub set_name { + my ( $self, $arg ) = @_; + $name{ id $self} = _check_name($arg); + return; +} + +# Usage : $name = _check_name($name); +# Purpose : Check for valid name +# Returns : String (the valid name) +# Parameters : String (the name) +# Throws : If name > $MAX_NAME_LENGTH characters +# Comments : None +sub _check_name { + my ($name) = @_; + return $name + if !defined $name + || ( length $name > 0 && length $name <= $MAX_NAME_LENGTH ); + confess 'Name is empty' if !length $name; + confess "Name ($name) longer than $MAX_NAME_LENGTH characters"; +} + +=method description + + Usage : my $description = $transcript->description; + Purpose : Getter for description attribute + Returns : String (e.g. 
"CXC chemokine 64") + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub description { + my ($self) = @_; + return $description{ id $self}; +} + +=method set_description + + Usage : $transcript->set_description('CXC chemokine 64'); + Purpose : Setter for description attribute + Returns : undef + Parameters : String (the description) + Throws : No exceptions + Comments : None + +=cut + +sub set_description { + my ( $self, $arg ) = @_; + $description{ id $self} = $arg; + return; +} + +=method biotype + + Usage : my $biotype = $transcript->biotype; + Purpose : Getter for biotype attribute + Returns : String (e.g. "protein_coding") + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub biotype { + my ($self) = @_; + return $biotype{ id $self}; +} + +=method set_biotype + + Usage : $transcript->set_biotype('protein_coding'); + Purpose : Setter for biotype attribute + Returns : undef + Parameters : String (the biotype) + Throws : No exceptions + Comments : None + +=cut + +sub set_biotype { + my ( $self, $arg ) = @_; + $biotype{ id $self} = check_biotype($arg); + return; +} + +=method check_biotype + + Usage : $biotype = check_biotype($biotype); + Purpose : Check for valid biotype + Returns : String (the valid biotype) + Parameters : String (the biotype) + Throws : If biotype is missing or invalid (i.e. not alphanumeric) + Comments : None + +=cut + +sub check_biotype { + my ($biotype) = @_; + return $biotype if defined $biotype && $biotype =~ m/\A \w+ \z/xms; + confess 'No biotype specified' if !defined $biotype; + confess "Invalid biotype ($biotype) specified"; +} + +=method seq_name + + Usage : my $seq_name = $transcript->seq_name; + Purpose : Getter for sequence name attribute + Returns : String (e.g. "5") + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub seq_name { + my ($self) = @_; + return $seq_name{ id $self}; +} + +=method set_seq_name + + Usage : $transcript->set_seq_name('5'); + Purpose : Setter for sequence name attribute + Returns : undef + Parameters : String (the sequence name) + Throws : No exceptions + Comments : None + +=cut + +sub set_seq_name { + my ( $self, $arg ) = @_; + $seq_name{ id $self} = check_seq_name($arg); + return; +} + +=method check_seq_name + + Usage : $seq_name = check_seq_name($seq_name); + Purpose : Check for valid sequence name + Returns : String (the valid sequence name) + Parameters : String (the sequence name) + Throws : If sequence name is missing or invalid (i.e. 
not alphanumeric) + Comments : None + +=cut + +sub check_seq_name { + my ($seq_name) = @_; + return $seq_name if defined $seq_name && $seq_name =~ m/\A \w+ \z/xms; + confess 'No sequence name specified' if !defined $seq_name; + confess "Invalid sequence name ($seq_name) specified"; +} + +=method start + + Usage : my $start = $transcript->start; + Purpose : Getter for start attribute + Returns : +ve Int + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub start { + my ($self) = @_; + return $start{ id $self}; +} + +=method set_start + + Usage : $transcript->set_start(40352744); + Purpose : Setter for start attribute + Returns : undef + Parameters : +ve Int (the start) + Throws : No exceptions + Comments : None + +=cut + +sub set_start { + my ( $self, $arg ) = @_; + $start{ id $self} = check_start($arg); + return; +} + +=method check_start + + Usage : $start = check_start($start); + Purpose : Check for valid start + Returns : +ve Int (the valid start) + Parameters : +ve Int (the start) + Throws : If start is missing or not a positive integer + Comments : None + +=cut + +sub check_start { + my ($start) = @_; + return $start if defined $start && $start =~ m/\A \d+ \z/xms; + confess 'No start specified' if !defined $start; + confess "Invalid start ($start) specified"; +} + +=method end + + Usage : my $end = $transcript->end; + Purpose : Getter for end attribute + Returns : +ve Int + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub end { + my ($self) = @_; + return $end{ id $self}; +} + +=method set_end + + Usage : $transcript->set_end(40352744); + Purpose : Setter for end attribute + Returns : undef + Parameters : +ve Int (the end) + Throws : No exceptions + Comments : None + +=cut + +sub set_end { + my ( $self, $arg ) = @_; + $end{ id $self} = check_end($arg); + return; +} + +=method check_end + + Usage : $end = check_end($end); + Purpose : Check for valid end + Returns : +ve Int (the valid end) + Parameters : +ve Int (the end) + Throws : If end is missing or not a positive integer + Comments : None + +=cut + +sub check_end { + my ($end) = @_; + return $end if defined $end && $end =~ m/\A \d+ \z/xms; + confess 'No end specified' if !defined $end; + confess "Invalid end ($end) specified"; +} + +=method strand + + Usage : my $strand = $transcript->strand; + Purpose : Getter for strand attribute + Returns : Int (1 or -1) + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub strand { + my ($self) = @_; + return $strand{ id $self}; +} + +=method set_strand + + Usage : $transcript->set_strand(1); + Purpose : Setter for strand attribute + Returns : undef + Parameters : Int (the strand) + Throws : No exceptions + Comments : None + +=cut + +sub set_strand { + my ( $self, $arg ) = @_; + $strand{ id $self} = _check_strand($arg); + return; +} + +# Usage : $strand = _check_strand($strand); +# Purpose : Check for valid strand +# Returns : Int (1 or -1) (the valid strand) +# Parameters : Int (1 or -1) (the strand) +# Throws : If strand is missing or not 1 or -1 +# Comments : None +sub _check_strand { + my ($strand) = @_; + return $strand if defined $strand && $strand =~ m/\A \-? 
1 \z/xms; + confess 'No strand specified' if !defined $strand; + confess "Invalid strand ($strand) specified"; +} + +=method gene + + Usage : my $gene = $transcript->gene; + Purpose : Getter for gene attribute + Returns : DETCT::Gene + Parameters : None + Throws : No exceptions + Comments : None + +=cut + +sub gene { + my ($self) = @_; + return $gene{ id $self}; +} + +=method set_gene + + Usage : $transcript->set_gene($gene); + Purpose : Setter for gene attribute + Returns : undef + Parameters : DETCT::Gene + Throws : No exceptions + Comments : None + +=cut + +sub set_gene { + my ( $self, $arg ) = @_; + $gene{ id $self} = _check_gene($arg); + return; +} + +# Usage : $gene = _check_gene($gene); +# Purpose : Check for valid gene +# Returns : DETCT::Gene +# Parameters : DETCT::Gene +# Throws : If gene is invalid (i.e. not a DETCT::Gene object) +# Comments : None +sub _check_gene { + my ($gene) = @_; + confess 'Class of gene (', ref $gene, ') not DETCT::Gene' + if defined $gene && !$gene->isa('DETCT::Gene'); + return $gene; +} + +1; diff --git a/perlcritic.rc b/perlcritic.rc new file mode 100644 index 0000000..30963fa --- /dev/null +++ b/perlcritic.rc @@ -0,0 +1,12 @@ +severity = 1 +exclude = RequirePodSections RequireVersionVar + +[Documentation::PodSpelling] +stop_words_file = pod-stop-words.txt + +[Perl::Critic::Policy::BuiltinFunctions::ProhibitStringyEval] +allow_includes = 1 + +[InputOutput::RequireCheckedSyscalls] +functions = :builtins +exclude_functions = print sleep diff --git a/pod-stop-words.txt b/pod-stop-words.txt new file mode 100644 index 0000000..e69de29 diff --git a/script/detag_fastq.pl b/script/detag_fastq.pl new file mode 100644 index 0000000..cd53089 --- /dev/null +++ b/script/detag_fastq.pl @@ -0,0 +1,163 @@ +#!/usr/bin/env perl + +# PODNAME: detag_fastq.pl +# ABSTRACT: Extract tags from transcript counting FASTQ files and process files + +## Author : is1 +## Maintainer : is1 +## Created : 2012-12-15 +## Last commit by : $Author$ +## Last modified : $Date$ +## Revision : $Revision$ +## Repository URL : $HeadURL$ + +use warnings; +use strict; +use autodie; +use Carp; +use Try::Tiny; + +use Getopt::Long; +use Pod::Usage; +use DETCT::Misc::Tag; + +=head1 DESCRIPTION + + +=head1 EXAMPLES + + +=cut + +# Default options +## no critic (ProhibitMagicNumbers) +my $fastq_read1_input; +my $fastq_read2_input; +my $fastq_output_prefix; +my $pre_detag_trim_length = 54; +my $polyt_trim_length = 14; +my $polyt_min_length = 10; +my @read_tags; +my $no_pair_suffix = 0; +my ( $help, $man ); +## use critic + +# Get and check command line options +get_and_check_options(); + +DETCT::Misc::Tag::detag_trim_fastq( + { + fastq_read1_input => $fastq_read1_input, + fastq_read2_input => $fastq_read2_input, + fastq_output_prefix => $fastq_output_prefix, + pre_detag_trim_length => $pre_detag_trim_length, + polyt_trim_length => $polyt_trim_length, + polyt_min_length => $polyt_min_length, + read_tags => \@read_tags, + no_pair_suffix => $no_pair_suffix, + } +); + +# Get and check command line options +sub get_and_check_options { + + # Get options + GetOptions( + 'fastq_read1_input=s' => \$fastq_read1_input, + 'fastq_read2_input=s' => \$fastq_read2_input, + 'fastq_output_prefix=s' => \$fastq_output_prefix, + 'pre_detag_trim_length=i' => \$pre_detag_trim_length, + 'polyt_trim_length=i' => \$polyt_trim_length, + 'polyt_min_length=i' => \$polyt_min_length, + 'read_tags=s@{1,}' => \@read_tags, + 'no_pair_suffix' => \$no_pair_suffix, + 'help' => \$help, + 'man' => \$man, + ) or pod2usage(2); + + # 
Documentation + if ($help) { + pod2usage(1); + } + elsif ($man) { + pod2usage( -verbose => 2 ); + } + + # Check options + if ( !$fastq_read1_input ) { + pod2usage("--fastq_read1_input must be specified\n"); + } + if ( !$fastq_read2_input ) { + pod2usage("--fastq_read2_input must be specified\n"); + } + if ( !$fastq_output_prefix ) { + pod2usage("--fastq_output_prefix must be specified\n"); + } + if ( !@read_tags ) { + pod2usage("--read_tags must be specified\n"); + } + + return; +} + +=head1 USAGE + + detag_fastq.pl + [--fastq_read1_input file] + [--fastq_read2_input file] + [--fastq_output_prefix prefix] + [--pre_detag_trim_length int] + [--polyt_trim_length int] + [--polyt_min_length int] + [--read_tags tags...] + [--no_pair_suffix] + [--help] + [--man] + +=head1 OPTIONS + +=over 8 + +=item B<--fastq_read1_input FILE> + +Input FASTQ file for read 1. + +=item B<--fastq_read2_input FILE> + +Input FASTQ file for read 2. + +=item B<--fastq_output_prefix FILE> + +Prefix for output FASTQ files. + +=item B<--pre_detag_trim_length INT> + +Length to trim reads to before detagging. + +=item B<--polyt_trim_length INT> + +Length of (largely) polyT to be trimmed. + +=item B<--polyt_min_length INT> + +Minimum number of consecutive Ts in length of polyT. + +=item B<--read_tags TAGS> + +Read tags. + +=item B<--no_pair_suffix> + +Input FASTQ file don't have pair suffixes. + +=item B<--help> + +Print a brief help message and exit. + +=item B<--man> + +Print this script's manual page and exit. + +=back + +=cut diff --git a/script/make_test_fasta.pl b/script/make_test_fasta.pl new file mode 100644 index 0000000..51f589b --- /dev/null +++ b/script/make_test_fasta.pl @@ -0,0 +1,164 @@ +#!/usr/bin/env perl + +# PODNAME: make_test_fasta.pl +# ABSTRACT: Make transcript counting test file in FASTA format + +## Author : is1 +## Maintainer : is1 +## Created : 2012-11-12 +## Last commit by : $Author$ +## Last modified : $Date$ +## Revision : $Revision$ +## Repository URL : $HeadURL$ + +use warnings; +use strict; +use autodie; +use Carp; +use Try::Tiny; + +use Getopt::Long; +use Pod::Usage; + +=head1 DESCRIPTION + +This script generates test transcript counting FASTA files. The number and +maximum length of chromosomes can be varied. + +=head1 EXAMPLES + + # Generate random FASTA file using default values + perl script/make_test_fasta.pl > test.fa + + # Generate FASTA file with reproducible chromosomes using default values + perl script/make_test_fasta.pl --seed 1 > test.fa + + # Generate FASTA file with 25 chromosomes (each up to 50 Mbp long) + perl script/make_test_fasta.pl \ + --seq_region_count 25 \ + --seq_region_max_length 50_000_000 \ + > test.fa + +=cut + +# Default options +## no critic (ProhibitMagicNumbers) +my $seed; +my $seq_region_count = 1; +my $seq_region_max_length = 1_000_000; +my ( $help, $man ); +## use critic + +# Get and check command line options +get_and_check_options(); + +# Ensure reproducible chromosome lengths if seed set +if ( defined $seed ) { + srand $seed; +} + +# Make each chromosome of random length +my %length_of; +foreach my $seq_region ( 1 .. $seq_region_count ) { + my $length = int rand( $seq_region_max_length + 1 ); + $length_of{$seq_region} = $length; +} + +# Ensure sequences are always random +srand; + +# Generate sequence for each chromosome one by one +foreach my $seq_region ( 1 .. 
$seq_region_count ) { + printf ">%s\n", $seq_region; + my $length_required = $length_of{$seq_region}; + my $length_printed = 0; + while ($length_required) { + ## no critic (ProhibitMagicNumbers) + print qw( A G C T a g c t ) [ int rand 8 ]; + ## use critic + $length_required--; + $length_printed++; + + # Wrap every 80 bases + ## no critic (ProhibitMagicNumbers) + if ( !( $length_printed % 80 ) ) { + ## use critic + print "\n"; + } + } + + # Final new line if haven't just printed one + ## no critic (ProhibitMagicNumbers) + if ( $length_printed % 80 ) { + ## use critic + print "\n"; + } +} + +# Get and check command line options +sub get_and_check_options { + + # Get options + GetOptions( + 'seed=i' => \$seed, + 'seq_region_count=i' => \$seq_region_count, + 'seq_region_max_length=i' => \$seq_region_max_length, + 'help' => \$help, + 'man' => \$man, + ) or pod2usage(2); + + # Documentation + if ($help) { + pod2usage(1); + } + elsif ($man) { + pod2usage( -verbose => 2 ); + } + + # Check options + if ( !$seq_region_count ) { + pod2usage("--seq_region_count must be a positive integer\n"); + } + if ( !$seq_region_max_length ) { + pod2usage("--seq_region_max_length must be a positive integer\n"); + } + + return; +} + +=head1 USAGE + + make_test_fasta.pl + [--seed seed] + [--seq_region_count int] + [--seq_region_max_length int] + [--help] + [--man] + +=head1 OPTIONS + +=over 8 + +=item B<--seed INT> + +Random seed (to get reproducible chromosome lengths). + +=item B<--seq_region_count INT> + +Number of seq regions (default to 1). + +=item B<--seq_region_max_length INT> + +Maximum length of each seq region (defaults to 1,000,000 bp). + +=item B<--help> + +Print a brief help message and exit. + +=item B<--man> + +Print this script's manual page and exit. + +=back + +=cut diff --git a/script/make_test_fastq.pl b/script/make_test_fastq.pl new file mode 100644 index 0000000..a4ce378 --- /dev/null +++ b/script/make_test_fastq.pl @@ -0,0 +1,276 @@ +#!/usr/bin/env perl + +# PODNAME: make_test_fastq.pl +# ABSTRACT: Make transcript counting test files in FASTQ format + +## Author : is1 +## Maintainer : is1 +## Created : 2013-01-08 +## Last commit by : $Author$ +## Last modified : $Date$ +## Revision : $Revision$ +## Repository URL : $HeadURL$ + +use warnings; +use strict; +use autodie; +use Carp; +use Try::Tiny; + +use Getopt::Long; +use Pod::Usage; + +=head1 DESCRIPTION + +This script generates test transcript counting FASTQ files. 
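+The number of read pairs, the read length, the length of the polyT tail and
+the read tags can all be varied. Read pairs are written to a pair of FASTQ
+files named using the output prefix (test_1.fastq and test_2.fastq by
+default) and a count of reads matching each tag is printed when the script
+finishes.
+
+The generated files can then be fed to detag_fastq.pl; a minimal sketch of
+that workflow is shown below (the tag and file names are only illustrative):
+
+    perl script/make_test_fastq.pl --output_prefix test --read_tags NNNNCTACCA
+    perl script/detag_fastq.pl \
+        --fastq_read1_input test_1.fastq \
+        --fastq_read2_input test_2.fastq \
+        --fastq_output_prefix detagged \
+        --read_tags NNNNCTACCA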
+ +=head1 EXAMPLES + + # Generate random FASTQ files using default values + perl script/make_test_fastq.pl --read_tags NNNNCTACCA + + # Generate FASTQ files with reproducible reads using default values + perl script/make_test_fastq.pl --seed 1 + + # Generate random FASTQ files with 1000 read pairs and 54 bp reads + perl script/make_test_fastq.pl \ + --read_tags NNNNCTACCA \ + --read_pair_count 1000 \ + --read_length 54 + +=cut + +# Default options +## no critic (ProhibitMagicNumbers) +my $seed; +my $output_prefix = 'test'; +my $read_pair_count = 100; +my @read_tags; +my $read_length = 75; +my $polyt_length = 14; +my ( $help, $man ); +## use critic + +# Get and check command line options +get_and_check_options(); + +# Assume all tags are same length +my $tag_length = length $read_tags[0]; + +# Add dummy tag for reads that don't match a real tag +push @read_tags, q{X} x $tag_length; + +# Ensure reproducible FASTQ files if seed set +if ( defined $seed ) { + srand $seed; +} + +# Generate start of each read name +## no critic (ProhibitMagicNumbers) +my $read_name_base = 'HS'; +$read_name_base .= ( int rand 50 ) + 1; # Instrument name +$read_name_base .= q{_}; +$read_name_base .= ( int rand 20_000 ) + 1; # Run +$read_name_base .= q{:}; +$read_name_base .= ( int rand 8 ) + 1; # Flowcell lane +$read_name_base .= q{:}; +## use critic + +my %tag_count; + +## no critic (RequireBriefOpen) +open my $fh1, '>', $output_prefix . '_1.fastq'; +open my $fh2, '>', $output_prefix . '_2.fastq'; +## use critic +foreach ( 1 .. $read_pair_count ) { + my $read_name = get_read_name($read_name_base); + + my $tag = @read_tags[ int rand scalar @read_tags ]; # Random tag + + # 20% of read 1s have no polyT + my $has_polyt = int rand 5 ? 1 : 0; ## no critic (ProhibitMagicNumbers) + + print {$fh1} q{@}, $read_name, '/1', "\n"; + print {$fh1} get_read1_seq( $read_length, $tag, $has_polyt ), "\n"; + print {$fh1} "+\n"; + print {$fh1} q{~} x $read_length, "\n"; + print {$fh2} q{@}, $read_name, '/2', "\n"; + print {$fh2} get_read2_seq($read_length), "\n"; + print {$fh2} "+\n"; + print {$fh2} q{~} x $read_length, "\n"; + + if ( !$has_polyt ) { + $tag = $read_tags[-1]; # Dummy tag + } + $tag_count{$tag}++; +} +close $fh1; +close $fh2; + +# Display tag counts +foreach my $read_tag (@read_tags) { + print $output_prefix, "\t", $read_tag, ":\t", + ( $tag_count{$read_tag} || 0 ), "\n"; +} + +# Construct read name +sub get_read_name { + my ( $read_name, ) = @_; + + ## no critic (ProhibitMagicNumbers) + $read_name .= ( int rand 3_000 ) + 1; # Tile number + $read_name .= q{:}; + $read_name .= ( int rand 20_000 ) + 1; # Cluster x coordinate + $read_name .= q{:}; + $read_name .= ( int rand 200_000 ) + 1; # Cluster y coordinate + ## use critic + + return $read_name; +} + +# Get read 1 sequence (just random but with tag) +sub get_read1_seq { + my ( $read_len, $tag, $has_polyt ) = @_; + + my $is_dummy_tag = $tag =~ m/X/xms ? 
1 : 0; # If tag is X then no tag + + # Replace IUPAC codes in tag with random bases + $tag =~ s/ N / qw( A G C T )[ int rand 4 ] /xmsge; + $tag =~ s/ B / qw( G C T )[ int rand 3 ] /xmsge; + $tag =~ s/ D / qw( A G T )[ int rand 3 ] /xmsge; + $tag =~ s/ H / qw( A C T )[ int rand 3 ] /xmsge; + $tag =~ s/ V / qw( A G C )[ int rand 3 ] /xmsge; + $tag =~ s/ R / qw( A G )[ int rand 2 ] /xmsge; + $tag =~ s/ Y / qw( C T )[ int rand 2 ] /xmsge; + $tag =~ s/ K / qw( G T )[ int rand 2 ] /xmsge; + $tag =~ s/ M / qw( A C )[ int rand 2 ] /xmsge; + $tag =~ s/ S / qw( G C )[ int rand 2 ] /xmsge; + $tag =~ s/ W / qw( A T )[ int rand 2 ] /xmsge; + $tag =~ s/ X / qw( A G C T )[ int rand 4 ] /xmsge; # Not actually IUPAC + + # Make the last two bases be Ns so should never match a real tag + if ( $is_dummy_tag && $has_polyt ) { # No need if not polyT + substr $tag, -2, 2, 'NN'; ## no critic (ProhibitMagicNumbers) + } + + # 20% of reads have a single mismatch somewhere in the tag + if ( int rand 5 ) { ## no critic (ProhibitMagicNumbers) + my $mismatch_base = int rand length $tag; + my $base = substr $tag, $mismatch_base, 1; + $base =~ tr/AGCT/TCGA/; + substr $tag, $mismatch_base, 1, $base; + } + + # Read begins with tag then polyT (or add polyA if doesn't have polyT) + my $seq = $tag; + $seq .= $has_polyt ? q{T} x $polyt_length : q{A} x $polyt_length; + $read_len -= length $seq; + + # Rest of read is random + ## no critic (ProhibitMagicNumbers) + $seq .= join q{}, map { qw( A G C T ) [ int rand 4 ] } 1 .. $read_len; + ## use critic + + return $seq; +} + +# Get read 2 sequence (just random) +sub get_read2_seq { + my ($read_len) = @_; + + ## no critic (ProhibitMagicNumbers) + return join q{}, map { qw( A G C T ) [ int rand 4 ] } 1 .. $read_len; + ## use critic +} + +# Get and check command line options +sub get_and_check_options { + + # Get options + GetOptions( + 'seed=i' => \$seed, + 'output_prefix=s' => \$output_prefix, + 'read_pair_count=i' => \$read_pair_count, + 'read_tags=s@{1,}' => \@read_tags, + 'read_length=i' => \$read_length, + 'polyt_length=i' => \$polyt_length, + 'help' => \$help, + 'man' => \$man, + ) or pod2usage(2); + + # Documentation + if ($help) { + pod2usage(1); + } + elsif ($man) { + pod2usage( -verbose => 2 ); + } + + # Check options + if ( !$output_prefix ) { + pod2usage("--output_prefix must be specified\n"); + } + if ( !$read_pair_count ) { + pod2usage("--read_pair_count must be a positive integer\n"); + } + if ( !$read_length ) { + pod2usage("--read_length must be a positive integer\n"); + } + if ( !@read_tags ) { + pod2usage("--read_tags must be specified\n"); + } + + return; +} + +=head1 USAGE + + make_test_fastq.pl + [--seed seed] + [--output_prefix prefix] + [--read_pair_count int] + [--read_tags tags...] + [--read_length int] + [--polyt_length int] + [--help] + [--man] + +=head1 OPTIONS + +=over 8 + +=item B<--seed INT> + +Random seed (to get reproducible chromosome lengths). + +=item B<--output_prefix FILE> + +Prefix for output FASTQ files. + +=item B<--read_pair_count INT> + +Number of read pairs aligned to each seq region (defaults to 100). + +=item B<--read_tags TAGS> + +Read tags. + +=item B<--read_length INT> + +Length of reads (defaults to 75 bp). + +=item B<--polyt_length INT> + +Length of polyT in read 1. + +=item B<--help> + +Print a brief help message and exit. + +=item B<--man> + +Print this script's manual page and exit. 
+ +=back + +=cut diff --git a/script/make_test_sam.pl b/script/make_test_sam.pl new file mode 100644 index 0000000..e88d2a1 --- /dev/null +++ b/script/make_test_sam.pl @@ -0,0 +1,630 @@ +#!/usr/bin/env perl + +# PODNAME: make_test_sam.pl +# ABSTRACT: Make transcript counting test file in SAM format + +## Author : is1 +## Maintainer : is1 +## Created : 2012-09-14 +## Last commit by : $Author$ +## Last modified : $Date$ +## Revision : $Revision$ +## Repository URL : $HeadURL$ + +use warnings; +use strict; +use autodie; +use Carp; +use Try::Tiny; + +use Getopt::Long; +use Pod::Usage; +use Readonly; + +=head1 DESCRIPTION + +This script generates test transcript counting SAM files. The number and maximum +length of chromosomes can be varied along with the number and length of reads. +Read tags must be specified. + +=head1 EXAMPLES + + # Generate random BAM file using default values + perl script/make_test_sam.pl --read_tags NNNNCTACCA \ + | samtools view -bS - | samtools sort - test + + # Generate BAM file with reproducible chromosomes using default values + perl script/make_test_sam.pl --seed 1 --read_tags NNNNCTACCA \ + | samtools view -bS - | samtools sort - test + + # Generate BAM file with 25 chromosomes (each up to 50 Mbp long), 1000 + # alignments per chromosome and four 10mer tags + perl script/make_test_sam.pl \ + --seq_region_count 25 \ + --seq_region_max_length 50_000_000 \ + --read_pair_count 1000 \ + --read_tags NNNNCTACCA NNNNAAGTTA NNNNTTAATC NNNNTAGACA \ + | samtools view -bS - | samtools sort - test + +=cut + +# Constants from http://samtools.sourceforge.net/SAM1.pdf + +# Regexps for checking alignment line mandatory fields +Readonly our %ALIGNMENT_REGEXP_MANDATORY => ( + qname => qr/\A [!-?A-~]{1,255} \z/xms, + rname => qr/\A [*] | [!-()+-<>-~][!-~]* \z/xms, + cigar => qr/\A [*] | (\d+[MIDNSHPX=])+ \z/xms, + rnext => qr/\A [*] | = | [!-()+-<>-~][!-~]* \z/xms, + seq => qr/\A [*] | [[:alpha:]=.]+ \z/xms, + qual => qr/\A [!-~]+ \z/xms, +); + +# Ranges for checking alignment line mandatory fields +Readonly our %ALIGNMENT_RANGE_MANDATORY => ( + flag => [ 0, 2**16 - 1 ], + pos => [ 0, 2**29 - 1 ], + mapq => [ 0, 2**8 - 1 ], + pnext => [ 0, 2**29 - 1 ], + tlen => [ -2**29 + 1, 2**29 - 1 ], +); + +# Regexps for checking alignment line optional fields +Readonly our %ALIGNMENT_REGEXP_OPTIONAL => ( + A => qr/\A [!-~] \z/xms, + i => qr/\A [-+]?\d+ \z/xms, + f => qr/\A [-+]?\d*[.]?\d+([eE][-+]?\d+)? 
\z/xms, + Z => qr/\A [ !-~]+ \z/xms, + H => qr/\A [\dA-F]+ \z/xms, + B => qr/\A [cCsSiIf](,[-+]?\d*[.]?\d+([eE][-+]?\d+)?)+ \z/xms, +); + +# Bits of flag field +Readonly our $FLAG_READ_PAIRED => 1; +Readonly our $FLAG_PROPER_PAIR => 2; +Readonly our $FLAG_READ_UNMAPPED => 4; +Readonly our $FLAG_MATE_UNMAPPED => 8; +Readonly our $FLAG_READ_REVERSE_STRAND => 16; +Readonly our $FLAG_MATE_REVERSE_STRAND => 32; +Readonly our $FLAG_FIRST_IN_PAIR => 64; +Readonly our $FLAG_SECOND_IN_PAIR => 128; +Readonly our $FLAG_DUPLICATE => 1024; + +# Chance one read of a pair is unmapped +Readonly our $CHANCE_UNMAPPED => 0.1; + +# Default options +## no critic (ProhibitMagicNumbers) +my $seed; +my $seq_region_count = 1; +my $seq_region_max_length = 1_000_000; +my $read_pair_count = 100; +my @read_tags; +my $read1_length = 30; +my $read2_length = 54; +my ( $help, $man ); +## use critic + +# Get and check command line options +get_and_check_options(); + +# Ensure reproducible chromosome lengths if seed set +if ( defined $seed ) { + srand $seed; +} + +# Construct command line +my @cl = ('make_test_sam.pl'); +if ($seed) { + push @cl, '--seed', $seed; +} +push @cl, '--seq_region_count', $seq_region_count; +push @cl, '--seq_region_max_length', $seq_region_max_length; +push @cl, '--read_pair_count', $read_pair_count; +push @cl, '--read_tags', @read_tags; +push @cl, '--read1_length', $read1_length; +push @cl, '--read2_length', $read2_length; +my $cl = join q{ }, @cl; + +# Print HD and RG SAM header +print header_line( 'HD', [ 'VN', '1.4' ], [ 'SO', 'unsorted' ] ); +print header_line( 'RG', [ 'ID', q{1} ], [ 'SM', 'TC' ] ); +print header_line( + 'PG', + [ 'ID', q{1} ], + [ 'PN', 'make_test_sam.pl' ], + [ 'CL', $cl ] +); + +# Make each chromosome of random length and print SQ SAM headers +my %length_of; +foreach my $seq_region ( 1 .. $seq_region_count ) { + my $length = int rand( $seq_region_max_length + 1 ); + $length_of{$seq_region} = $length; + print header_line( 'SQ', [ 'SN', $seq_region ], [ 'LN', $length ] ); +} + +# Ensure alignments are always random +srand; + +# Generate start of each read name +## no critic (ProhibitMagicNumbers) +my $qname_base = 'HS'; +$qname_base .= ( int rand 50 ) + 1; # Instrument name +$qname_base .= q{_}; +$qname_base .= ( int rand 20_000 ) + 1; # Run +$qname_base .= q{:}; +$qname_base .= ( int rand 8 ) + 1; # Flowcell lane +$qname_base .= q{:}; +## use critic + +# Generate alignments for each chromosome one by one +foreach my $seq_region ( 1 .. $seq_region_count ) { + foreach ( 1 .. $read_pair_count ) { + my $read1_qname = get_qname( $qname_base, get_read_tag() ); + my $read2_qname = $read1_qname; # Always the same + my ( $read1_pos, $read2_pos ) = + get_pos( $length_of{$seq_region}, $read1_length, $read2_length ); + my ( $read1_flag, $read2_flag ) = get_flag( $read1_pos, $read2_pos ); + my ( $read1_tlen, $read2_tlen ) = + get_tlen( $read1_pos, $read2_pos, $read1_length, $read2_length ); + ( $read1_flag, $read2_flag, $read1_pos, $read2_pos ) = + get_unmapped( $read1_flag, $read2_flag, $read1_pos, $read2_pos ); + my ($read1_cigar) = get_cigar($read1_length); + my ($read2_cigar) = get_cigar($read2_length); + my ($read1_nm) = get_nm(); + my ($read2_nm) = get_nm(); + + # Rarely generate 50 to 99 real duplicates to simulate peaks + ## no critic (ProhibitMagicNumbers) + my $num_real_duplicates = 0; + if ( rand $read_pair_count < 2 ) { + $num_real_duplicates = int( rand 50 ) + 50; + } + ## use critic + + # Generate PCR duplicates (i.e. 
marked as duplicates) + ## no critic (ProhibitMagicNumbers) + my $num_pcr_duplicates = poisson_number(0.6); + ## use critic + + my $num_duplicates = $num_real_duplicates + $num_pcr_duplicates; + foreach my $read_pair_count ( 1 .. $num_duplicates + 1 ) { + + # First read + print alignment_line( + qname => $read1_qname, + flag => $read1_flag, + rname => $seq_region, + pos => $read1_pos, + mapq => 255, + cigar => $read1_cigar, + rnext => q{=}, + pnext => $read2_pos, + tlen => $read1_tlen, + seq => get_seq($read1_length), + qual => get_qual($read1_length), + opt => { + 'NM:i' => $read1_nm, + 'RG:Z' => q{1}, + }, + ); + + # Second read + print alignment_line( + qname => $read2_qname, + flag => $read2_flag, + rname => $seq_region, + pos => $read2_pos, + mapq => 255, + cigar => $read2_cigar, + rnext => q{=}, + pnext => $read1_pos, + tlen => $read2_tlen, + seq => get_seq($read2_length), + qual => get_qual($read2_length), + opt => { + 'NM:i' => $read2_nm, + 'RG:Z' => q{1}, + }, + ); + + if ( $read_pair_count == $num_real_duplicates + 1 ) { + + # Mark rest of reads as duplicates + $read1_flag = $read1_flag | $FLAG_DUPLICATE; + $read2_flag = $read2_flag | $FLAG_DUPLICATE; + } + } + } +} + +# Generate SAM header line +sub header_line { + my ( $record_type, @data ) = @_; + + my $header_line = q{}; + + if ( $record_type !~ m/\A [[:alpha:]][[:alpha:]] \z/xms ) { + confess 'Invalid record type (', $record_type, q{)}; + } + + $header_line .= q{@} . $record_type; + + foreach my $datum (@data) { + if ( ref $datum ne 'ARRAY' ) { + confess 'Arrayref of tag / value pairs is required (not ', + ref $datum, q{)}; + } + my ( $tag, $value ) = @{$datum}; + if ( $tag !~ m/\A [[:alpha:]][[:alpha:]\d] \z/xms ) { + confess 'Invalid tag (', $tag, q{)}; + } + if ( $value !~ m/\A [ -~]+ \z/xms ) { + confess 'Invalid value (', $value, q{)}; + } + + $header_line .= "\t" . $tag . q{:} . 
$value; + } + + $header_line .= "\n"; + + return $header_line; +} + +# Generate SAM alignment line +sub alignment_line { + my (%data) = @_; + + # Check string fields + foreach my $field ( sort keys %ALIGNMENT_REGEXP_MANDATORY ) { + if ( $data{$field} !~ $ALIGNMENT_REGEXP_MANDATORY{$field} ) { + confess 'Invalid ', uc $field, ' (', $data{$field}, q{)}; + } + } + + # Check int fields + foreach my $field ( sort keys %ALIGNMENT_RANGE_MANDATORY ) { + if ( $data{$field} < $ALIGNMENT_RANGE_MANDATORY{$field}->[0] + || $data{$field} > $ALIGNMENT_RANGE_MANDATORY{$field}->[1] ) + { + confess 'Invalid ', uc $field, ' (', $data{$field}, q{)}; + } + } + + # Mandatory fields + my $alignment_line = join "\t", $data{qname}, $data{flag}, $data{rname}, + $data{pos}, $data{mapq}, $data{cigar}, $data{rnext}, $data{pnext}, + $data{tlen}, $data{seq}, $data{qual}; + + # Optional fields + if ( exists $data{opt} ) { + foreach my $tag_type ( keys %{ $data{opt} } ) { + my $value = $data{opt}->{$tag_type}; + my ( $tag, $type ) = split /:/xms, $tag_type; + + # Validate tag + if ( $tag !~ /\A [[:alpha:]][[:alpha:]\d] \z/xms ) { + confess 'Invalid tag (', $tag, q{)}; + } + + # Validate type + if ( !exists $ALIGNMENT_REGEXP_OPTIONAL{$type} ) { + confess 'Invalid type (', $type, q{)}; + } + + # Validate value + if ( $value !~ $ALIGNMENT_REGEXP_OPTIONAL{$type} ) { + confess 'Invalid ', $tag, ' (', $value, q{)}; + } + + $alignment_line .= "\t"; + $alignment_line .= join q{:}, $tag, $type, $value; + } + } + + $alignment_line .= "\n"; + + return $alignment_line; +} + +# Get a random read tag and substitute random bases +sub get_read_tag { + my $tag = $read_tags[ int rand $#read_tags + 1 ]; + + # Replace IUPAC code with random bases + $tag =~ s/ N / qw( A G C T )[ int rand 4 ] /xmsge; + $tag =~ s/ B / qw( G C T )[ int rand 3 ] /xmsge; + $tag =~ s/ D / qw( A G T )[ int rand 3 ] /xmsge; + $tag =~ s/ H / qw( A C T )[ int rand 3 ] /xmsge; + $tag =~ s/ V / qw( A G C )[ int rand 3 ] /xmsge; + $tag =~ s/ R / qw( A G )[ int rand 2 ] /xmsge; + $tag =~ s/ Y / qw( C T )[ int rand 2 ] /xmsge; + $tag =~ s/ K / qw( G T )[ int rand 2 ] /xmsge; + $tag =~ s/ M / qw( A C )[ int rand 2 ] /xmsge; + $tag =~ s/ S / qw( G C )[ int rand 2 ] /xmsge; + $tag =~ s/ W / qw( A T )[ int rand 2 ] /xmsge; + + return $tag; +} + +# Construct read name +sub get_qname { + my ( $qname, $read_tag ) = @_; + + ## no critic (ProhibitMagicNumbers) + $qname .= ( int rand 3_000 ) + 1; # Tile number + $qname .= q{:}; + $qname .= ( int rand 20_000 ) + 1; # Cluster x coordinate + $qname .= q{:}; + $qname .= ( int rand 200_000 ) + 1; # Cluster y coordinate + $qname .= q{#}; + $qname .= $read_tag; + ## use critic + + return $qname; +} + +# Get position for both reads +sub get_pos { + my ( $seq_region_len, $read1_len, $read2_len ) = @_; + + my ( $read1_pos, $read2_pos ); + + my $pair_ok = 0; + + while ( !$pair_ok ) { + $read1_pos = ( int rand $seq_region_len ) + 1; + $read2_pos = ( int rand $seq_region_len ) + 1; + $pair_ok = 1; + + my $read1_end = $read1_pos + $read1_len - 1; + my $read2_end = $read2_pos + $read2_len - 1; + + # Check reads are within seq region + if ( $read1_end > $seq_region_len ) { + $pair_ok = 0; + } + if ( $read2_end > $seq_region_len ) { + $pair_ok = 0; + } + + # Check reads don't overlap + if ( $read1_pos <= $read2_end && $read1_end >= $read2_pos ) { + $pair_ok = 0; + } + } + + return $read1_pos, $read2_pos; +} + +# Get flags for both reads (http://picard.sourceforge.net/explain-flags.html) +sub get_flag { + my ( $read1_pos, $read2_pos ) = @_; + + my 
$read1_flag = $FLAG_READ_PAIRED | $FLAG_PROPER_PAIR; + my $read2_flag = $FLAG_READ_PAIRED | $FLAG_PROPER_PAIR; + + if ( $read1_pos < $read2_pos ) { + $read1_flag = $read1_flag | $FLAG_MATE_REVERSE_STRAND; + $read2_flag = $read2_flag | $FLAG_READ_REVERSE_STRAND; + } + else { + $read1_flag = $read1_flag | $FLAG_READ_REVERSE_STRAND; + $read2_flag = $read2_flag | $FLAG_MATE_REVERSE_STRAND; + } + + $read1_flag = $read1_flag | $FLAG_FIRST_IN_PAIR; + $read2_flag = $read2_flag | $FLAG_SECOND_IN_PAIR; + + return $read1_flag, $read2_flag; +} + +# Get template length for both reads +sub get_tlen { + my ( $read1_pos, $read2_pos, $read1_len, $read2_len ) = @_; + + my ( $read1_tlen, $read2_tlen ); + + if ( $read1_pos < $read2_pos ) { + $read1_tlen = $read2_pos - $read1_pos + $read2_len; + $read2_tlen = -$read1_tlen; + } + else { + $read2_tlen = $read1_pos - $read2_pos + $read1_len; + $read1_tlen = -$read2_tlen; + } + + return $read1_tlen, $read2_tlen; +} + +# Adjust flags and positions if a read is unmapped +sub get_unmapped { + my ( $read1_flag, $read2_flag, $read1_pos, $read2_pos ) = @_; + + if ( rand() < $CHANCE_UNMAPPED ) { + if ( rand() < 0.5 ) { ## no critic (ProhibitMagicNumbers) + # Read 1 unmapped + $read1_flag = $read1_flag | $FLAG_READ_UNMAPPED; + $read2_flag = $read2_flag | $FLAG_MATE_UNMAPPED; + $read1_pos = $read2_pos; + } + else { + # Read 2 unmapped + $read2_flag = $read2_flag | $FLAG_READ_UNMAPPED; + $read1_flag = $read1_flag | $FLAG_MATE_UNMAPPED; + $read2_pos = $read1_pos; + } + } + + return $read1_flag, $read2_flag, $read1_pos, $read2_pos; +} + +# Get sequence (just random) +sub get_seq { + my ($read_len) = @_; + + ## no critic (ProhibitMagicNumbers) + return join q{}, map { qw( A G C T ) [ int rand 4 ] } 1 .. $read_len; + ## use critic +} + +# Get CIGAR string containing random soft clipping +sub get_cigar { + my ($read_len) = @_; + + my $m = $read_len; # Length of alignment match + + ## no critic (ProhibitMagicNumbers) + my $s1 = poisson_number(0.7); # Soft clipping at start of alignment + my $s2 = poisson_number(0.7); # Soft clipping at end of alignment + ## use critic + + $m = $m - $s1 - $s2; + + # Construct CIGAR + + my $cigar = $m . q{M}; + + if ($s1) { + $cigar = $s1 . q{S} . $cigar; + } + if ($s2) { + $cigar = $cigar . $s2 . 
q{S}; + } + + return $cigar; +} + +# Get quality +sub get_qual { + my ($read_len) = @_; + + return q{~} x $read_len; +} + +# Get random number of mismatches for a read +sub get_nm { + ## no critic (ProhibitMagicNumbers) + return poisson_number(0.6); # ~ e^-0.5, so skewed towards 0 and 1 + ## use critic +} + +# Generate random Poisson-distributed number using Knuth's algorithm +sub poisson_number { + my ($l) = @_; # e^-lambda + + my $k = 0; + my $p = 1; + + while ( $p > $l ) { + $k++; + $p = $p * rand; + } + + return $k - 1; +} + +# Get and check command line options +sub get_and_check_options { + + # Get options + GetOptions( + 'seed=i' => \$seed, + 'seq_region_count=i' => \$seq_region_count, + 'seq_region_max_length=i' => \$seq_region_max_length, + 'read_pair_count=i' => \$read_pair_count, + 'read_tags=s@{1,}' => \@read_tags, + 'read1_length=i' => \$read1_length, + 'read2_length=i' => \$read2_length, + 'help' => \$help, + 'man' => \$man, + ) or pod2usage(2); + + # Documentation + if ($help) { + pod2usage(1); + } + elsif ($man) { + pod2usage( -verbose => 2 ); + } + + # Check options + if ( !$seq_region_count ) { + pod2usage("--seq_region_count must be a positive integer\n"); + } + if ( !$seq_region_max_length ) { + pod2usage("--seq_region_max_length must be a positive integer\n"); + } + if ( !$read_pair_count ) { + pod2usage("--read_pair_count must be a positive integer\n"); + } + if ( !$read1_length ) { + pod2usage("--read1_length must be a positive integer\n"); + } + if ( !$read2_length ) { + pod2usage("--read2_length must be a positive integer\n"); + } + if ( !@read_tags ) { + pod2usage("--read_tags must be specified\n"); + } + + return; +} + +=head1 USAGE + + make_test_sam.pl + [--seed seed] + [--seq_region_count int] + [--seq_region_max_length int] + [--read_pair_count int] + [--read_tags tags...] + [--read1_length int] + [--read2_length int] + [--help] + [--man] + +=head1 OPTIONS + +=over 8 + +=item B<--seed INT> + +Random seed (to get reproducible chromosome lengths). + +=item B<--seq_region_count INT> + +Number of seq regions (default to 1). + +=item B<--seq_region_max_length INT> + +Maximum length of each seq region (defaults to 1,000,000 bp). + +=item B<--read_pair_count INT> + +Number of read pairs aligned to each seq region (defaults to 100). + +=item B<--read_tags TAGS> + +Read tags. + +=item B<--read1_length INT> + +Length of read 1 after trimming (defaults to 30 bp). + +=item B<--read2_length INT> + +Length of read 2 (defaults to 54 bp). + +=item B<--help> + +Print a brief help message and exit. + +=item B<--man> + +Print this script's manual page and exit. 
+ +=back + +=cut diff --git a/script/run_de_pipeline.pl b/script/run_de_pipeline.pl new file mode 100644 index 0000000..2f61ed6 --- /dev/null +++ b/script/run_de_pipeline.pl @@ -0,0 +1,236 @@ +#!/usr/bin/env perl + +# PODNAME: run_de_pipeline.pl +# ABSTRACT: Run DETCT differential expression pipeline + +## Author : is1 +## Maintainer : is1 +## Created : 2012-09-26 +## Last commit by : $Author$ +## Last modified : $Date$ +## Revision : $Revision$ +## Repository URL : $HeadURL$ + +use warnings; +use strict; +use autodie; +use Carp; +use Try::Tiny; + +use Probe::Perl; +use Getopt::Long; +use Pod::Usage; +use English qw( -no_match_vars ); +use File::Spec; +use File::Slurp; +use DETCT::Pipeline::WithDiffExprStages; +use DETCT::Analysis; + +=head1 DESCRIPTION + + + +=head1 EXAMPLES + + + +=cut + +# Default options +my $scheduler = 'lsf'; +my $analysis_dir = q{.}; +my $analysis_yaml = File::Spec->catfile( $analysis_dir, 'analysis.yaml' ); +my $stages_yaml = File::Spec->catfile( $analysis_dir, 'stages.yaml' ); +## no critic (ProhibitMagicNumbers) +my $max_retries = 10; +my $sleep_time = 600; # 10 minutes +## use critic +my $stage_to_run; +my $component_to_run; +my $verbose; +my ( $help, $man ); + +# Get command line (including interpreter and options) +my $cmd_line = get_cmd_line(); + +# Get and check command line options +get_and_check_options(); + +# Create analysis +my $analysis = DETCT::Analysis->new_from_yaml($analysis_yaml); + +# Create pipeline +my $pipeline = DETCT::Pipeline::WithDiffExprStages->new( + { + scheduler => $scheduler, + analysis_dir => $analysis_dir, + analysis => $analysis, + cmd_line => $cmd_line, + max_retries => $max_retries, + sleep_time => $sleep_time, + verbose => $verbose, + } +); + +# Add stages to pipeline +$pipeline->add_stages_from_yaml($stages_yaml); + +# Are we running the main pipeline or running a specific component of a specific +# stage (i.e. a job to be run under LSF or locally)? +if ($stage_to_run) { + $pipeline->set_stage_to_run( $pipeline->get_stage_by_name($stage_to_run) ); +} +if ($component_to_run) { + $pipeline->set_component_to_run($component_to_run); +} + +# Turn off verbose output when running specific components +if ( $pipeline->stage_to_run && $pipeline->component_to_run ) { + $pipeline->set_verbose(0); +} + +# Write overview of pipeline input and config files to log file +if ( !$pipeline->stage_to_run && !$pipeline->component_to_run ) { + my @log = map { "$_\n" } $pipeline->input_overview; + push @log, "\nYAML analysis config file:\n\n", read_file($analysis_yaml); + push @log, "\nYAML stages config file:\n\n", read_file($stages_yaml); + $pipeline->write_log_file( 'de.log', @log ); +} + +# Print overview of pipeline input +$pipeline->say_if_verbose( $pipeline->input_overview ); + +# Run pipeline +$pipeline->run(); + +# Get entire command line +sub get_cmd_line { + + # Get all lib directories + my %lib = map { $_ => 1 } @INC; + + # Remove default lib directories + foreach my $lib ( Probe::Perl->perl_inc() ) { + delete $lib{$lib}; + } + + # Remove PERL5LIB lib directories + foreach my $lib ( split /:/xms, $ENV{PERL5LIB} ) { + delete $lib{$lib}; + } + + # Reconstruct -I lib directories + my @libs; + foreach my $lib ( keys %lib ) { + push @libs, '-I' . 
$lib; + } + + return join q{ }, Probe::Perl->find_perl_interpreter(), @libs, + $PROGRAM_NAME, @ARGV; +} + +# Get and check command line options +sub get_and_check_options { + + # Get options + GetOptions( + 'scheduler=s' => \$scheduler, + 'dir=s' => \$analysis_dir, + 'analysis_yaml=s' => \$analysis_yaml, + 'stages_yaml=s' => \$stages_yaml, + 'max_retries=i' => \$max_retries, + 'sleep_time=i' => \$sleep_time, + 'stage=s' => \$stage_to_run, + 'component=i' => \$component_to_run, + 'verbose' => \$verbose, + 'help' => \$help, + 'man' => \$man, + ) or pod2usage(2); + + # Documentation + if ($help) { + pod2usage(1); + } + elsif ($man) { + pod2usage( -verbose => 2 ); + } + + # Check options + if ( $scheduler ne 'lsf' && $scheduler ne 'local' ) { + pod2usage("--scheduler must be 'lsf' or 'local'\n"); + } + if ( $stage_to_run && !$component_to_run + || !$stage_to_run && $component_to_run ) + { + pod2usage("--stage and --component must be specified together\n"); + } + + return; +} + +=head1 USAGE + + run_de_pipeline.pl + [--scheduler lsf|local] + [--dir directory] + [--analysis_yaml file] + [--stages_yaml file] + [--max_retries int] + [--sleep_time int] + [--stage stage] + [--component int] + [--verbose] + [--help] + [--man] + +=head1 OPTIONS + +=over 8 + +=item B<--scheduler lsf|local> + +Job scheduler - lsf (default) or local (for testing). + +=item B<--dir DIRECTORY> + +Working directory for analysis. + +=item B<--analysis_yaml FILE> + +YAML analysis configuration file. + +=item B<--stages_yaml FILE> + +YAML stages configuration file. + +=item B<--max_retries INT> + +Maximum number of times to retry a failing job. + +=item B<--sleep_time INT> + +Time to sleep, in seconds, between each iteration of the pipeline. + +=item B<--stage STAGE> + +The specific stage of the pipeline to be run. + +=item B<--component INT> + +The index of the component of the specified stage of the pipeline to be run. + +=item B<--verbose> + +Print information about the pipeline as it runs. + +=item B<--help> + +Print a brief help message and exit. + +=item B<--man> + +Print this script's manual page and exit. 
+ +=back + +=cut diff --git a/script/run_deseq.R b/script/run_deseq.R new file mode 100644 index 0000000..920f79b --- /dev/null +++ b/script/run_deseq.R @@ -0,0 +1,124 @@ +library(DESeq) +library(RColorBrewer) +library(gplots) + +Args <- commandArgs(); +countFile <- Args[4] +designFile <- Args[5] +outputFile <- Args[6] +sizeFactorsFile <- Args[7] +qcPdfFile <- Args[8] + +# Get data and design +countTable <- read.table( countFile, header=TRUE, row.names=1 ) +design <- read.table( designFile, header=TRUE, row.names=1 ) +numFactors <- ncol(design) +numConditions <- nlevels(design$condition) + +# Check design +if (numFactors > 2) { + stop("Too many factors") +} +if (numConditions != 2) { + stop("Must be two conditions") +} + +# Write QC graphs to PDF +pdf(qcPdfFile) + +# Create CountDataSets +cdsOneFactFull <- newCountDataSet( countTable, design$condition ) +if (numFactors == 2) { + cdsTwoFactFull <- newCountDataSet( countTable, design ) +} + +# Remove regions with sum of counts below the 40th quantile +# See "5 Independent filtering and multiple testing" of +# http://bioconductor.org/packages/devel/bioc/vignettes/DESeq/inst/doc/DESeq.pdf +rs <- rowSums ( counts ( cdsOneFactFull )) +use <- (rs > quantile(rs, probs=0.4)) +cdsOneFactFilt <- cdsOneFactFull[ use, ] +if (numFactors == 2) { + cdsTwoFactFilt <- cdsTwoFactFull[ use, ] +} + +# Normalise +cdsOneFactFull <- estimateSizeFactors( cdsOneFactFull ) +cdsOneFactFilt <- estimateSizeFactors( cdsOneFactFilt ) +if (numFactors == 2) { + cdsTwoFactFull <- estimateSizeFactors( cdsTwoFactFull ) + cdsTwoFactFilt <- estimateSizeFactors( cdsTwoFactFilt ) +} +write.table( sizeFactors( cdsOneFactFull ), file=sizeFactorsFile, + col.names=FALSE, row.names=FALSE, quote=FALSE, sep="\t" ) + +# Estimate variance +cdsOneFactFiltPooled <- tryCatch({ + estimateDispersions( cdsOneFactFilt ) +}, error = function(e) { + estimateDispersions( cdsOneFactFilt, fitType="local" ) +}) +cdsOneFactFullBlind <- tryCatch({ + estimateDispersions( cdsOneFactFull, method="blind" ) +}, error = function(e) { + estimateDispersions( cdsOneFactFull, method="blind", fitType="local" ) +}) +if (numFactors == 1) { + plotDispEsts( cdsOneFactFiltPooled ) +} else if (numFactors == 2) { + cdsTwoFactFiltPooledCR <- tryCatch({ + estimateDispersions( cdsTwoFactFilt, method="pooled-CR" ) + }, error = function(e) { + estimateDispersions( cdsTwoFactFilt, method="pooled-CR", fitType="local" ) + }) + cdsTwoFactFullBlind <- tryCatch({ + estimateDispersions( cdsTwoFactFull, method="blind" ) + }, error = function(e) { + estimateDispersions( cdsTwoFactFull, method="blind", fitType="local" ) + }) + plotDispEsts( cdsTwoFactFiltPooledCR ) +} + +# Compare conditions +conditions <- levels(design$condition) +res <- nbinomTest( cdsOneFactFiltPooled, conditions[1], conditions[2] ) +if (numFactors == 2) { + fit1 <- fitNbinomGLMs( cdsTwoFactFiltPooledCR, count ~ group + condition ) + fit0 <- fitNbinomGLMs( cdsTwoFactFiltPooledCR, count ~ group ) + res$pval <- nbinomGLMTest( fit1, fit0 ) + res$padj <- p.adjust( res$pval, method="BH" ) +} +plotMA(res) +hist(res$pval, breaks=100, col="skyblue", border="slateblue", + main="Histogram of p values") + +# Write output +res = data.frame(id=res$id, pval=res$pval, padj=res$padj) +write.table( res, file=outputFile, col.names=FALSE, row.names=FALSE, + quote=FALSE, sep="\t" ) + +# Variance stabilising transformation +vsdOneFactFull <- varianceStabilizingTransformation( cdsOneFactFullBlind ) +if (numFactors == 2) { + vsdTwoFactFull <- varianceStabilizingTransformation( 
cdsTwoFactFullBlind )
+}
+
+# Plot heatmap of counts
+select <- order(rowMeans(counts(cdsOneFactFull)), decreasing=TRUE)[1:30]
+hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
+heatmap.2(exprs(vsdOneFactFull)[select,], col=hmcol, trace="none",
+    margin=c(10, 6))
+
+# Plot heatmap of sample to sample distances
+dists <- dist( t( exprs(vsdOneFactFull) ) )
+mat <- as.matrix( dists )
+heatmap.2(mat, trace="none", col = rev(hmcol), margin=c(13, 13))
+
+# Plot PCA of samples
+print(plotPCA(vsdOneFactFull, intgroup=c("condition")))
+if (numFactors == 2) {
+    print(plotPCA(vsdTwoFactFull, intgroup=c("group")))
+    print(plotPCA(vsdTwoFactFull, intgroup=c("condition", "group")))
+}
+
+dev.off()
diff --git a/src/quince_chiphmmnew.cpp b/src/quince_chiphmmnew.cpp
new file mode 100644
index 0000000..d8a86ff
--- /dev/null
+++ b/src/quince_chiphmmnew.cpp
@@ -0,0 +1,596 @@
+// steve qin.
+// 07/01/08
+//#include "stdafx.h"
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <vector>
+#include <cmath>
+#include <cstdlib>
+#include <cstdio>
+#include <ctime>
+using namespace std;
+
+#define MAX_LENGTH 500
+#define PI 3.14159265
+#define LIMIT 100
+#define MIN(a,b) ((a) < (b) ? (a):(b))
+#define MAX(a,b) ((a) > (b) ? (a):(b))
+
+void readData(const char dataFileName[], vector<int> &order,
+        vector<double> &data, int *nRow, double *totalcount);
+
+double logIntPoisson(const int k, const double lambda);
+
+double logPoisson(const double x, const double lambda);
+
+double genPoisson(const double y, const double mu, const double alpha);
+
+double logGenPoisson(const double y, const double mu, const double alpha);
+
+double logIntGenPoisson(const int y, const double mu, const double alpha);
+
+double logIntTrunc0GenPoisson(const int y, const double mu, const double alpha);
+
+double logTrunc0GenPoisson(const double y, const double mu, const double alpha);
+
+void pathFinder(const int count, const vector<int> &order,
+        const vector<double> &data, const int chromosomeLengthInBins,
+        int *&path, double *&proba, double *&logproba, double *&hits,
+        const double totalMapReads, const double totalPeakReads,
+        const double totalPeakArea, const double medianPeakBinCount,
+        const int numPeaks, const int readcoverage, const int threshold);
+
+void parameterEstimate(const double *peaksdata, const int peakscount,
+        const int threshold, const double mu, const double alpha,
+        double *mufore, double *alphafore);
+
+double likelihood(const double *y, const int chromosomeLengthInBins,
+        const int threshold, const double mu, const double alpha);
+
+double unran(int *na, int *nb, int *nc);
+
+double gammaln(double xx);
+
+int main(int argc, char **argv) {
+    int j, numCycles = 1;
+    int count, chromosomeLengthInBins, binSize;
+    int readcoverage = 0;//added 04/13/08
+    vector<int> order;
+    vector<double> data;
+    int *path;
+    double *hits;
+    double *proba, *logproba;
+    char inputname[MAX_LENGTH] = "10.select.chr21.txt";//"jy9.10.select.chr22.txt";//"sle.txt";
+    char parasname[MAX_LENGTH] = "jy10.paras.txt";
+    char outputname[MAX_LENGTH] = "out.txt";
+    ofstream outPutFile;
+    double totalMapReads = 0;
+    double totalPeakReads = 0;
+    double totalPeakArea = 0;
+    double medianPeakWidth = 0;
+    double medianPeakBinCount = 0;
+    int numPeaks = 0;
+    istringstream iss;
+    string lineString;
+    double temval;
+    vector<double> datas;
+    double totalcount;
+    int threshold;
+
+    if (argc != 4) {
+        printf(
+                "3 options need to be specified:\n\tinput file name,\n\tinformation file name,\n\toutputfile name.\n");
+        exit(0);
+    }
+    for (j = 0; j < MAX_LENGTH; j++) {
+        inputname[j] = argv[1][j];
+        parasname[j] = argv[2][j];
+        outputname[j] = argv[3][j];
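+        // The three arguments are the binned read-count file, the parameters
+        // ("information") file and the output file. The parameters file is
+        // read immediately after this loop and is expected to supply nine
+        // values, one per line: total mapped reads, total peak reads, total
+        // peak area, median peak width, number of peaks, read coverage, bin
+        // count threshold, bin size and chromosome length in bins.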
+ } + + ifstream inFile(parasname); + if (!inFile) { + cout << "Error opening input parameter file" << parasname << endl; + exit(0); + } + for (j = 0; j < 9; j++) { + getline(inFile, lineString); + iss.clear(); + iss.str(lineString + " "); + iss >> temval; + datas.push_back(temval); + } + totalMapReads = datas[0]; + totalPeakReads = datas[1]; + totalPeakArea = datas[2]; + medianPeakWidth = datas[3]; + // if(medianPeakWidth >800) + // medianPeakWidth = 800; + numPeaks = (int) datas[4]; + readcoverage = (int) datas[5]; + threshold = (int) datas[6]; + binSize = (int) datas[7]; + chromosomeLengthInBins = (int) datas[8]; + + medianPeakBinCount = medianPeakWidth / (double) binSize; + path = new int[chromosomeLengthInBins]; + proba = new double[chromosomeLengthInBins]; + logproba = new double[chromosomeLengthInBins]; + srand((unsigned) time(NULL)); + readData(inputname, order, data, &count, &totalcount); + for (j = 0; j < numCycles; j++) { + pathFinder(count, order, data, chromosomeLengthInBins, path, proba, + logproba, hits, totalMapReads, totalPeakReads, totalPeakArea, + medianPeakBinCount, numPeaks, readcoverage, threshold); + } + outPutFile.open(outputname); + if (!outPutFile) { + cout << "ERROR: Unable to open file: " << outputname << endl; + exit(30); + }//end of if + for (j = 0; j < chromosomeLengthInBins; j++) { + if (proba[j] > 0.5)//0.01 + { + outPutFile << j << " " << path[j] << " " << proba[j] << " " + << hits[j] << " " << logproba[j] << endl; + } + } + delete[] path; + delete[] proba; + delete[] hits; + outPutFile.close(); + return 0; +}//end of main + +void pathFinder(const int count, const vector &order, + const vector &data, const int chromosomeLengthInBins, + int *&path, double *&proba, double *&logproba, double *&hits, + const double totalMapReads, const double totalPeakReads, + const double totalPeakArea, const double medianPeakBinCount, + const int numPeaks, const int readcoverage, const int threshold) { + int j; + double (*logfnh)[2], trp[2][2]; + double p[2], p0, p1; + double ratio, compa; + double dif = 0, inside1, inside2, inside3, inside4; + int na, nb, nc; + //double lambdaback, lambdafore; + double sum = 0, sum2 = 0, mean, var, nsize; + double muback, alphaback; + double mufore, alphafore; + //double mufore,mualpha; + double mu, alpha; + int bgtotal, number; + double *peaksdata; + //- int threshold; + //ofstream outParameterFile; + + //outParameterFile.open("poisson.out"); + //if (!outParameterFile) { + // cout << "ERROR: Unable to open file poisson.out." << endl; + // exit(30); + //}//end of if + na = rand() + 1; + nb = rand() - 1; + nc = rand(); + // hits are for all 25bp window on the genome. + // only some of them are non-zero. 
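+    //
+    // Outline of the approach implemented below: each bin's read count is
+    // modelled with a two-state HMM whose emissions are generalised Poisson
+    // distributions, one state for background bins and one for enriched
+    // (peak) bins. Background mu/alpha are estimated by method of moments
+    // from bins below the count threshold (including empty bins), with
+    // mu = mean and alpha = (sqrt(var/mean) - 1) / mean; foreground
+    // estimates start from the remaining bins and are then refined by
+    // parameterEstimate(). A forward pass fills in a state path and a
+    // backward pass converts it into a per-bin probability of the enriched
+    // state (proba/logproba); main() reports bins whose probability
+    // exceeds 0.5.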
+ hits = new double[chromosomeLengthInBins]; + sum = 0; + sum2 = 0; + bgtotal = 0; + //- threshold = 6; + for (j = 0; j < chromosomeLengthInBins; j++) { + hits[j] = 0; + } + for (j = 0; j < count; j++) { + if (order[j] > chromosomeLengthInBins) { + cout << "read bins extend further than the chromosome size in bins " << order[j] << " "<< chromosomeLengthInBins << endl; + exit(1); + } + hits[order[j]] = data[j]; + if (data[j] < threshold) { + // sum = sum + (double) floor(data[j]); + // sum2 = sum2 + (double) floor(data[j])*floor(data[j]); + sum = sum + data[j]; + sum2 = sum2 + data[j] * data[j]; + bgtotal++; + } + } + nsize = (double) chromosomeLengthInBins - count + bgtotal; + mean = sum / nsize; + var = (sum2 - (double) nsize * mean * mean) / (nsize - 1); + muback = mean; + alphaback = (sqrt(var / mean) - 1) / mean; + cout << "background: mu = " << muback << " alpha = " << alphaback << endl; + //outParameterFile << muback << " " << alphaback << endl; + // double aa = logGenPoisson(5,5,2); + peaksdata = new double[count - bgtotal]; + number = 0; + sum = 0; + sum2 = 0; + for (j = 0; j < count; j++) { + if (data[j] >= threshold) { + peaksdata[number] = data[j]; + // sum = sum + (double) floor(data[j]); + // sum2 = sum2 + (double) floor(data[j])*floor(data[j]); + sum = sum + data[j]; + sum2 = sum2 + data[j] * data[j]; + number++; + } + } + mean = sum / number; + var = (sum2 - (double) number * mean * mean) / (number - 1); + mu = mean; + alpha = (sqrt(var / mean) - 1) / mean; + cout << "foreground (raw): mu = " << mu << " alpha = " << alpha << endl; + //outParameterFile << mu << " " << alpha << endl; + //double ff = likelihood(peaksdata,number,threshold,6,0.3); + parameterEstimate(peaksdata, number, threshold, mu, alpha, &mufore, + &alphafore); + cout << "foreground: mu = " << mufore << " alpha = " << alphafore << endl; + //outParameterFile << mufore << " " << alphafore << endl; + //outParameterFile.close(); + // exit(0); + + for (j = 0; j < 10; j++)//200 + { + double aa = logIntGenPoisson(j, muback, alphaback); + double bb = logIntGenPoisson(j, mufore, alphafore); + double cc = logIntTrunc0GenPoisson(j, mufore, alphafore); + cout << "j= " << j << " " << exp(aa) << " " << exp(bb) << " " + << exp(cc) << endl; + }//end of j + // exit(0); + + //+ lambdaback = readcoverage * totalMapReads /(3100*0.9); + // cout <<"lambda foreground = "<= 0; j--) { + ratio = 1 / (1 + (trp[1][path[j + 1]] / trp[0][path[j + 1]]) * exp( + logfnh[j][1] - logfnh[j][0])); + logproba[j] = log(ratio); + /* + if((j>=143890)&&(j<143895)) + { + cout << "path= "< 0) { + logyfac = 0; + for (j = 2; j <= y; j++) + logyfac = logyfac + log((double) j); + } else { + cout << "error, y is negative. 
" << y << endl; + exit(0); + } + result = log(mu) - log(1 + alpha * mu); + result = y * result + (y - 1) * log(1 + alpha * y) - logyfac; + result = result - mu * (1 + alpha * y) / (1 + alpha * mu); + return result; +}//end of logIntGenPoisson + +void readData(const char dataFileName[], vector &order, + vector &data, int *nRow, double *totalcount) { + int count = 0; + int temOrder; + double temVal; + istringstream iss; + string lineString; + double sum = 0; + + ifstream inFile(dataFileName); + if (!inFile) { + cout << "Error opening input file" << dataFileName << endl; + exit(0); + } + count = 0; + sum = 0; + while (inFile) { + if (inFile) { + getline(inFile, lineString); + iss.clear(); + iss.str(lineString + " "); + iss >> temOrder >> temVal; + if (iss) { + order.push_back(temOrder); + data.push_back(temVal); + //07/04/08 sum = sum + (double) floor(temVal); + sum = sum + temVal; + }//end of if + }//end of if + count++; + }//end of while + *nRow = count - 1; + *totalcount = sum; + cout << "There are " << *nRow << " nonzero counts." << endl; +}//end of readData + +double unran(int *na, int *nb, int *nc) { + double random; + *na = (171 * (*na)) % 30269; + *nb = (172 * (*nb)) % 30307; + *nc = (170 * (*nc)) % 30323; + random = (double) *na / 30269.0 + (double) *nb / 30307.0 + (double) *nc + / 30323.0; + random = random - floor(random); + return random; +} + +double gammaln(double xx) { + double ser, stp, tmp, x, y, cof[6], gam; + int j; + cof[0] = 76.18009172947146; + cof[1] = -86.50532032941677; + cof[2] = 24.01409824083091; + cof[3] = -1.231739572450155; + cof[4] = 0.1208650973866179 * 0.01; + cof[5] = -0.5395239384953 * 0.00001; + stp = 2.5066282746310005; + x = xx; + y = x; + tmp = x + 5.5; + tmp = (x + 0.5) * log(tmp) - tmp; + ser = 1.000000000190015; + for (j = 0; j < 6; j++) { + y = y + 1.0; + ser = ser + cof[j] / y; + } + gam = tmp + log(stp * ser / x); + return gam; +} diff --git a/t/analysis.t b/t/analysis.t new file mode 100644 index 0000000..c6a7112 --- /dev/null +++ b/t/analysis.t @@ -0,0 +1,490 @@ +use Test::More; +use Test::Exception; +use Test::Warn; +use Test::DatabaseRow; +use Test::MockObject; +use Carp; + +plan tests => 146; + +use DETCT::Analysis; + +use File::Path qw( make_path ); +use POSIX qw( WIFEXITED); + +# Compile quince_chiphmmnew if necessary +if ( !-r 'bin/quince_chiphmmnew' ) { + make_path('bin'); + my $cmd = 'g++ -o bin/quince_chiphmmnew src/quince_chiphmmnew.cpp'; + WIFEXITED( system $cmd) or confess "Couldn't run $cmd"; +} + +my $is_ensembl_reachable = is_ensembl_reachable(); + +my $analysis = DETCT::Analysis->new( + { + name => 'zmp_ph1', + read1_length => 30, + read2_length => 54, + mismatch_threshold => 2, + bin_size => 100, + peak_buffer_width => 100, + hmm_sig_level => 0.001, + hmm_binary => 'bin/quince_chiphmmnew', + r_binary => 'R', + deseq_script => 'script/run_deseq.R', + output_sig_level => 0.05, + chunk_total => 20, + } +); + +isa_ok( $analysis, 'DETCT::Analysis' ); + +# Test name attribute +is( $analysis->name, 'zmp_ph1', 'Get name' ); +is( $analysis->set_name('zmp_ph2'), undef, 'Set name' ); +is( $analysis->name, 'zmp_ph2', 'Get new name' ); +throws_ok { $analysis->set_name() } qr/No name specified/ms, 'No name'; +my $long_name = 'X' x ( $DETCT::Analysis::MAX_NAME_LENGTH + 1 ); +throws_ok { $analysis->set_name('') } qr/Empty name specified/ms, 'Empty name'; +throws_ok { $analysis->set_name($long_name) } qr/longer than \d+ characters/ms, + 'Long name'; + +# Test read 1 length attribute +is( $analysis->read1_length, 30, 'Get read 1 length' ); +is( 
$analysis->set_read1_length(40), undef, 'Set read 1 length' ); +is( $analysis->read1_length, 40, 'Get new read 1 length' ); +throws_ok { $analysis->set_read1_length() } qr/No read 1 length specified/ms, + 'No read 1 length'; +throws_ok { $analysis->set_read1_length(-1) } qr/Invalid read 1 length/ms, + 'Invalid read 1 length'; + +# Test read 2 length attribute +is( $analysis->read2_length, 54, 'Get read 2 length' ); +is( $analysis->set_read2_length(64), undef, 'Set read 2 length' ); +is( $analysis->read2_length, 64, 'Get new read 2 length' ); +throws_ok { $analysis->set_read2_length() } qr/No read 2 length specified/ms, + 'No read 2 length'; +throws_ok { $analysis->set_read2_length(-2) } qr/Invalid read 2 length/ms, + 'Invalid read 2 length'; + +# Test mismatch threshold attribute +is( $analysis->mismatch_threshold, 2, 'Get mismatch threshold' ); +is( $analysis->set_mismatch_threshold(3), undef, 'Set mismatch threshold' ); +is( $analysis->mismatch_threshold, 3, 'Get new mismatch threshold' ); +throws_ok { $analysis->set_mismatch_threshold() } +qr/No mismatch threshold specified/ms, 'No mismatch threshold'; +throws_ok { $analysis->set_mismatch_threshold(-1) } +qr/Invalid mismatch threshold/ms, 'Invalid mismatch threshold'; + +# Test bin size attribute +is( $analysis->bin_size, 100, 'Get bin size' ); +is( $analysis->set_bin_size(200), undef, 'Set bin size' ); +is( $analysis->bin_size, 200, 'Get new bin size' ); +throws_ok { $analysis->set_bin_size() } qr/No bin size specified/ms, + 'No bin size'; +throws_ok { $analysis->set_bin_size(-1) } qr/Invalid bin size/ms, + 'Invalid bin size'; + +# Test peak buffer width attribute +is( $analysis->peak_buffer_width, 100, 'Get peak buffer width' ); +is( $analysis->set_peak_buffer_width(200), undef, 'Set peak buffer width' ); +is( $analysis->peak_buffer_width, 200, 'Get new peak buffer width' ); +throws_ok { $analysis->set_peak_buffer_width() } +qr/No peak buffer width specified/ms, 'No peak buffer width'; +throws_ok { $analysis->set_peak_buffer_width(-1) } +qr/Invalid peak buffer width/ms, 'Invalid peak buffer width'; + +# Test HMM significance level attribute +is( $analysis->hmm_sig_level, 0.001, 'Get HMM significance level' ); +is( $analysis->set_hmm_sig_level(0.1), undef, 'Set HMM significance level' ); +is( $analysis->hmm_sig_level, 0.1, 'Get new HMM significance level' ); +throws_ok { $analysis->set_hmm_sig_level() } +qr/No HMM significance level specified/ms, 'No HMM significance level'; +throws_ok { $analysis->set_hmm_sig_level(1) } +qr/Invalid HMM significance level/ms, 'Invalid HMM significance level'; + +# Test HMM binary attribute +is( $analysis->hmm_binary, 'bin/quince_chiphmmnew', 'Get HMM binary' ); +is( $analysis->set_hmm_binary('bin'), undef, 'Set HMM binary' ); +is( $analysis->hmm_binary, 'bin', 'Get new HMM binary' ); +throws_ok { $analysis->set_hmm_binary() } qr/No HMM binary specified/ms, + 'No HMM binary'; +throws_ok { $analysis->set_hmm_binary('nonexistent') } +qr/does not exist or cannot be read/ms, 'Missing HMM binary'; + +# Test R binary attribute +is( $analysis->r_binary, 'R', 'Get R binary' ); +is( $analysis->set_r_binary('S'), undef, 'Set R binary' ); +is( $analysis->r_binary, 'S', 'Get new R binary' ); +throws_ok { $analysis->set_r_binary() } qr/No R binary specified/ms, + 'No R binary'; + +# Test DESeq script attribute +is( $analysis->deseq_script, 'script/run_deseq.R', 'Get DESeq script' ); +is( $analysis->set_deseq_script('script'), undef, 'Set DESeq script' ); +is( $analysis->deseq_script, 'script', 'Get new DESeq script' 
); +throws_ok { $analysis->set_deseq_script() } qr/No DESeq script specified/ms, + 'No DESeq script'; +throws_ok { $analysis->set_deseq_script('nonexistent') } +qr/does not exist or cannot be read/ms, 'Missing DESeq script'; + +# Test output significance level attribute +is( $analysis->output_sig_level, 0.05, 'Get output significance level' ); +is( $analysis->set_output_sig_level(0.01), + undef, 'Set output significance level' ); +is( $analysis->output_sig_level, 0.01, 'Get new output significance level' ); +throws_ok { $analysis->set_output_sig_level() } +qr/No output significance level specified/ms, 'No output significance level'; +throws_ok { $analysis->set_output_sig_level(1) } +qr/Invalid output significance level/ms, 'Invalid output significance level'; + +# Test reference FASTA attribute +is( $analysis->ref_fasta, undef, 'Get reference FASTA' ); +is( $analysis->set_ref_fasta('t/data/test12.fa'), undef, + 'Set reference FASTA' ); +is( $analysis->ref_fasta, 't/data/test12.fa', 'Get new reference FASTA' ); +throws_ok { $analysis->set_ref_fasta('nonexistent') } qr/cannot be read/ms, + 'Missing reference FASTA'; + +# Test Ensembl host attribute +is( $analysis->ensembl_host, undef, 'Get Ensembl host' ); +is( $analysis->set_ensembl_host('ensembldb.ensembl.org'), + undef, 'Set Ensembl host' ); +is( $analysis->ensembl_host, 'ensembldb.ensembl.org', 'Get new Ensembl host' ); + +# Test Ensembl port attribute +is( $analysis->ensembl_port, undef, 'Get Ensembl port' ); +is( $analysis->set_ensembl_port(3306), undef, 'Set Ensembl port' ); +is( $analysis->ensembl_port, 3306, 'Get new Ensembl port' ); +throws_ok { $analysis->set_ensembl_port(-1) } qr/Invalid Ensembl port/ms, + 'Invalid Ensembl port'; + +# Test Ensembl username attribute +is( $analysis->ensembl_user, undef, 'Get Ensembl username' ); +is( $analysis->set_ensembl_user('anonymous'), undef, 'Set Ensembl username' ); +is( $analysis->ensembl_user, 'anonymous', 'Get new Ensembl username' ); + +# Test Ensembl password attribute +is( $analysis->ensembl_pass, undef, 'Get Ensembl password' ); +is( $analysis->set_ensembl_pass('secret'), undef, 'Set Ensembl password' ); +is( $analysis->ensembl_pass, 'secret', 'Get new Ensembl password' ); + +# Test Ensembl database name attribute +is( $analysis->ensembl_name, undef, 'Get Ensembl database name' ); +is( $analysis->set_ensembl_name('zv9_core'), + undef, 'Set Ensembl database name' ); +is( $analysis->ensembl_name, 'zv9_core', 'Get new Ensembl database name' ); + +# Test Ensembl species attribute +is( $analysis->ensembl_species, undef, 'Get Ensembl species' ); +is( $analysis->set_ensembl_species('danio_rerio'), + undef, 'Set Ensembl species' ); +is( $analysis->ensembl_species, 'danio_rerio', 'Get new Ensembl species' ); + +# Test chunk total attribute +is( $analysis->chunk_total, 20, 'Get chunk total' ); +is( $analysis->set_chunk_total(30), undef, 'Set chunk total' ); +is( $analysis->chunk_total, 30, 'Get new chunk total' ); +throws_ok { $analysis->set_chunk_total() } qr/No chunk total specified/ms, + 'No chunk total'; +throws_ok { $analysis->set_chunk_total(-1) } qr/Invalid chunk total/ms, + 'Invalid chunk total'; + +# Test sequences and chunks before adding samples +my $sequences = $analysis->get_all_sequences(); +is( scalar @{$sequences}, 0, 'No sequences' ); +my $chunks = $analysis->get_all_chunks(); +is( scalar @{$chunks}, 0, 'No chunks' ); + +# Mock sample object +my $sample = Test::MockObject->new(); +$sample->set_isa('DETCT::Sample'); +$sample->set_always( 'bam_file', 't/data/test1.bam' ); + +# 
Mock sample object with different reference sequence +my $sample_diff = Test::MockObject->new(); +$sample_diff->set_isa('DETCT::Sample'); +$sample_diff->set_always( 'bam_file', 't/data/test3.bam' ); + +# Test adding and retrieving samples +my $samples; +$samples = $analysis->get_all_samples(); +is( scalar @{$samples}, 0, 'No samples' ); +is( $analysis->add_sample($sample), undef, 'Add sample' ); +$samples = $analysis->get_all_samples(); +is( scalar @{$samples}, 1, 'Get one sample' ); +$analysis->add_sample($sample); +is( scalar @{$samples}, 2, 'Get two samples' ); +throws_ok { $analysis->add_sample($sample_diff) } qr/use different reference/ms, + 'Different reference for sample'; +throws_ok { $analysis->add_sample() } qr/No sample specified/ms, + 'No sample specified'; +throws_ok { $analysis->add_sample('invalid') } qr/Class of sample/ms, + 'Invalid sample'; + +# Test sequences and chunks after adding samples +$sequences = $analysis->get_all_sequences(); +is( scalar @{$sequences}, 5, '5 sequences' ); +$chunks = $analysis->get_all_chunks(); +ok( scalar @{$chunks} > 0, 'Chunks' ); + +# Count sequence in chunks +my $sequence_total = 0; +foreach my $chunk ( @{$chunks} ) { + $sequence_total += scalar @{$chunk}; +} +is( $sequence_total, 5, '5 sequences in chunks' ); + +# Recalculate chunks so one sequence per chunk +$analysis->set_chunk_total(10000); +$chunks = $analysis->get_all_chunks(); +is( scalar @{$chunks}, 5, '5 chunks' ); + +# Recalculate chunks so 5/3 sequences per chunk on average +$analysis->set_chunk_total(3); +$chunks = $analysis->get_all_chunks(); +is( scalar @{$chunks}, 3, '3 chunks' ); + +# Count sequence in chunks +$sequence_total = 0; +foreach my $chunk ( @{$chunks} ) { + $sequence_total += scalar @{$chunk}; +} +is( $sequence_total, 5, '5 sequences in chunks' ); + +# Test test chunk attribute +is( $analysis->test_chunk, undef, 'Get test chunk' ); +is( $analysis->set_test_chunk(1), undef, 'Set test chunk' ); +is( $analysis->test_chunk, 1, 'Get new test chunk' ); +$chunks = $analysis->get_all_chunks(); +is( scalar @{$chunks}, 1, '1 chunk' ); +is( $analysis->set_test_chunk(4), undef, 'Set test chunk' ); +$chunks = $analysis->get_all_chunks(); +is( scalar @{$chunks}, 3, '3 chunks' ); + +# Test constructing from YAML +$analysis = DETCT::Analysis->new_from_yaml('t/data/test_analysis12.yaml'); +isa_ok( $analysis, 'DETCT::Analysis' ); +$samples = $analysis->get_all_samples(); +is( scalar @{$samples}, 2, 'Get two YAML samples' ); +throws_ok { $analysis = DETCT::Analysis->new_from_yaml('nonexistent.yaml') } +qr/does not exist or cannot be read/ms, 'Missing YAML file'; + +# Test validating analysis +throws_ok { + $analysis = DETCT::Analysis->new_from_yaml('t/data/test_analysis13.yaml'); +} +qr/use different reference/ms, 'Different reference'; + +# Test summary info +$analysis = DETCT::Analysis->new_from_yaml('t/data/test_analysis1122.yaml'); +my @bam_files = $analysis->list_all_bam_files(); +is( scalar @bam_files, 2, '2 BAM files' ); +is( $bam_files[0], 't/data/test1.bam', 'Got BAM file' ); +my @tags = $analysis->list_all_tags_by_bam_file('t/data/test1.bam'); +is( scalar @tags, 2, '2 tags' ); +is( $tags[0], 'NNNNBAGAAG', 'Got tag' ); + +my $seq; + +# Set FASTA index +$analysis = DETCT::Analysis->new_from_yaml('t/data/test_analysis12.yaml'); +throws_ok { $analysis->set_fasta_index(); } qr/No FASTA index specified/ms, + 'No FASTA index'; +throws_ok { $analysis->set_fasta_index('invalid'); } qr/Class of FASTA index/ms, + 'Invalid FASTA index'; + +# Set Ensembl slice adaptor +$analysis = 
DETCT::Analysis->new_from_yaml('t/data/test_analysis12.yaml'); +throws_ok { $analysis->set_slice_adaptor(); } +qr/No Ensembl slice adaptor specified/ms, 'No slice adaptor'; +throws_ok { $analysis->set_slice_adaptor('invalid'); } +qr/Class of Ensembl slice adaptor/ms, 'Invalid slice adaptor'; + +# Get subsequence with missing parameters +$analysis = DETCT::Analysis->new_from_yaml('t/data/test_analysis12.yaml'); +throws_ok { $analysis->get_subsequence(); } qr/No sequence name specified/ms, + 'No sequence name'; +throws_ok { $analysis->get_subsequence('1'); } +qr/No sequence start specified/ms, 'No sequence start'; +throws_ok { $analysis->get_subsequence( '1', 1 ); } +qr/No sequence end specified/ms, 'No sequence end'; +throws_ok { $analysis->get_subsequence( '1', 1, 10 ); } +qr/No sequence strand specified/ms, 'No sequence strand'; + +# Check getting sequence from test FASTA file +# First 10 bp of chromosome 1 should be CCAGGCGCGG according to: + +=for comment +head -2 t/data/test12.fa +=cut + +$analysis = DETCT::Analysis->new_from_yaml('t/data/test_analysis12.yaml'); +$seq = $analysis->get_subsequence( '1', 1, 10, 1 ); +is( length $seq, 10, 'FASTA subsequence length' ); +is( $seq, 'CCAGGCGCGG', 'FASTA subsequence' ); +$seq = $analysis->get_subsequence( '1', 1, 10, -1 ); +is( length $seq, 10, 'FASTA reverse complement subsequence length' ); +is( $seq, 'CCGCGCCTGG', 'FASTA reverse complement subsequence' ); + +# Check getting subsequence outside size of sequence +$seq = $analysis->get_subsequence( '1', -1, 10, 1 ); +is( length $seq, 10, 'Negative start FASTA subsequence length' ); +is( $seq, 'CCAGGCGCGG', 'Negative start FASTA subsequence' ); +$seq = $analysis->get_subsequence( '1', -1, -1, 1 ); +is( length $seq, 1, 'Negative start and end FASTA subsequence length' ); +is( $seq, 'C', 'Negative start and end FASTA subsequence' ); +$seq = $analysis->get_subsequence( '1', 1_000_000_001, 1_000_000_010, 1 ); +is( length $seq, 0, 'Large start and end FASTA subsequence length' ); +is( $seq, '', 'Large start and end FASTA subsequence' ); + +# Check getting sequence from Ensembl database +# First 10 bp of chromosome 1 should be TTCTTCTGGG according to: +# http://www.ensembl.org/Danio_rerio/Location/View?r=1%3A1-10 +SKIP: { + skip 'Ensembl not reachable', 4 if !$is_ensembl_reachable; + + $analysis = DETCT::Analysis->new( + { + name => 'zmp_ph1', + read1_length => 30, + read2_length => 54, + mismatch_threshold => 2, + bin_size => 100, + peak_buffer_width => 100, + hmm_sig_level => 0.001, + hmm_binary => 'bin/quince_chiphmmnew', + r_binary => 'R', + deseq_script => 'script/run_deseq.R', + output_sig_level => 0.05, + chunk_total => 20, + ensembl_species => 'danio_rerio', + } + ); + $seq = $analysis->get_subsequence( '1', 1, 10, 1 ); + is( length $seq, 10, 'Ensembl subsequence length' ); + is( $seq, 'TTCTTCTGGG', 'Ensembl subsequence' ); + $seq = $analysis->get_subsequence( '1', 1, 10, -1 ); + is( length $seq, 10, 'Ensembl reverse complement subsequence length' ); + is( $seq, 'CCCAGAAGAA', 'Ensembl reverse complement subsequence' ); +} + +# Check getting sequence without FASTA file or Ensembl database +$analysis = DETCT::Analysis->new( + { + name => 'zmp_ph1', + read1_length => 30, + read2_length => 54, + mismatch_threshold => 2, + bin_size => 100, + peak_buffer_width => 100, + hmm_sig_level => 0.001, + hmm_binary => 'bin/quince_chiphmmnew', + r_binary => 'R', + deseq_script => 'script/run_deseq.R', + output_sig_level => 0.05, + chunk_total => 20, + } +); +throws_ok { $analysis->get_subsequence( '1', 1, 
10, 1 ); } +qr/No reference FASTA or Ensembl database/ms, 'No FASTA or Ensembl'; + +# Check getting sequence from Ensembl database with explicit connection +SKIP: { + skip 'Ensembl not reachable', 2 if !$is_ensembl_reachable; + + $analysis = DETCT::Analysis->new( + { + name => 'zmp_ph1', + read1_length => 30, + read2_length => 54, + mismatch_threshold => 2, + bin_size => 100, + peak_buffer_width => 100, + hmm_sig_level => 0.001, + hmm_binary => 'bin/quince_chiphmmnew', + r_binary => 'R', + deseq_script => 'script/run_deseq.R', + output_sig_level => 0.05, + chunk_total => 20, + ensembl_host => 'ensembldb.ensembl.org', + ensembl_port => 5306, + ensembl_user => 'anonymous', + ensembl_pass => '', + ensembl_species => 'danio_rerio', + } + ); + $seq = $analysis->get_subsequence( '1', 1, 10, 1 ); + is( length $seq, 10, 'Ensembl subsequence length' ); + is( $seq, 'TTCTTCTGGG', 'Ensembl subsequence' ); +} + +# Check getting sequence from specific Ensembl database +# Get database name via: + +=for comment +mysql -u anonymous -h ensembldb.ensembl.org -P 5306 -Bse \ +"SHOW DATABASES LIKE 'danio_rerio_core\_%'" | sort | tail -1 +=cut + +SKIP: { + skip 'Ensembl not reachable', 2 if !$is_ensembl_reachable; + + $analysis = DETCT::Analysis->new( + { + name => 'zmp_ph1', + read1_length => 30, + read2_length => 54, + mismatch_threshold => 2, + bin_size => 100, + peak_buffer_width => 100, + hmm_sig_level => 0.001, + hmm_binary => 'bin/quince_chiphmmnew', + r_binary => 'R', + deseq_script => 'script/run_deseq.R', + output_sig_level => 0.05, + chunk_total => 20, + ensembl_host => 'ensembldb.ensembl.org', + ensembl_port => 5306, + ensembl_user => 'anonymous', + ensembl_pass => '', + ensembl_name => 'danio_rerio_core_69_9', + } + ); + $seq = $analysis->get_subsequence( '1', 1, 10, 1 ); + is( length $seq, 10, 'Ensembl subsequence length' ); + is( $seq, 'TTCTTCTGGG', 'Ensembl subsequence' ); +} + +# Check getting subsequence outside size of sequence +SKIP: { + skip 'Ensembl not reachable', 6 if !$is_ensembl_reachable; + + $seq = $analysis->get_subsequence( '1', -1, 10, 1 ); + is( length $seq, 10, 'Negative start Ensembl subsequence length' ); + is( $seq, 'TTCTTCTGGG', 'Negative start Ensembl subsequence' ); + $seq = $analysis->get_subsequence( '1', -1, -1, 1 ); + is( length $seq, 1, 'Negative start and end Ensembl subsequence length' ); + is( $seq, 'T', 'Negative start and end Ensembl subsequence' ); + $seq = $analysis->get_subsequence( '1', 1_000_000_001, 1_000_000_010, 1 ); + is( length $seq, 10, 'Large start and end Ensembl subsequence length' ); + is( $seq, 'NNNNNNNNNN', 'Large start and end Ensembl subsequence' ); +} + +# Check if Ensembl is reachable +sub is_ensembl_reachable { + my $handle = IO::Socket::INET->new( + PeerAddr => 'ensembldb.ensembl.org:5306', + Timeout => 1, + Proto => 'tcp', + ); + + if ( defined $handle && $handle ) { + $handle->close(); + return 1; + } + else { + return 0; + } +} diff --git a/t/data/test1.bam b/t/data/test1.bam new file mode 100644 index 0000000..9866c53 Binary files /dev/null and b/t/data/test1.bam differ diff --git a/t/data/test1.bam.bai b/t/data/test1.bam.bai new file mode 100644 index 0000000..0177392 Binary files /dev/null and b/t/data/test1.bam.bai differ diff --git a/t/data/test12.fa b/t/data/test12.fa new file mode 100644 index 0000000..b36ef7e --- /dev/null +++ b/t/data/test12.fa @@ -0,0 +1,400 @@ +>1 +ccAGGcgcggaAcGTcaGGGcGCttAAcgaAcaCattCAGTcatTtcccatcTTGTcaTaCtAaAGATTCtAGATCAcga +CcttCttCcGtctGAGGCcggcCGcAAataGGtgCGggcgCacTaagcTcCAtACCCGttctcgtgcGGCActaCAtgAg 
+ATCCaAcCgCGTCcAAGTAgAGAcCTgaCGTATgGaTatTCttAacATAcCtAtGaatAttaaTaATtaAGgaACCcTgG +ttTaTctcCtACTtCGCtCgCGTTGGctATgGGTGAggCcTagTGagatTaAtGgTtTAgGgcgTccCtGCAgAgCTGGg +GCcCaTGCcGaTgcTgaCgATgAgAtGaaAGCGaATgGCGCGcAAtTCcAcAAaTAgGCtGatgTgCcTAAcccCgtAcg +GaaGCtGgtcCTcTTTaGctGtCGCacTacTCacCAggCTggcTATtcaagAcCgTaTGttcTGcGtgcGATcGatCtaG +agtgAGCttCCtCTtTAcCacACgCgGacagaCtgGGTAAatAGcTAcGtAACtGagAtAtgCgTgcATaggaTCtGCgC +agtACGtggtagGaGaGaTACAGttTccTACTTGAATCatACgcgAGCGatCGGcCGTACAcGgTtgtgcGAcTatgCCa +gaCCATgaGCTCCCtGAgCcCCtcaTGGTAaAgAtCCgctcAGGTataAgCttACgCaGAGacCctgacaggtAAcGgaG +ctcTAaAAaCTACTgCcCTCGgCTAcAccTatCGTgAaTcaAGAaTGAaaacAaAagCaaGgcgGCCGTaATATTGCGGg +GTATgTagGTAcGGtGgcaTcTgTTCCAaaCCacCgcTgtCggcTGTaTgAAgaaTcGGAaTaTGAgtAGctgTgGaTaC +ggcATcgTacTGcTTgcTtGgAGCCGtCCggAATGGCCAcAgCcaCctCTGcGccaggTGtcTaTatgCAccGgtTatcg +CCTagGctCaCcgCgGAaGGGtTTTGcagcGaCcTTtgcccAgTcgctCATGaTCggtgtGTGACTCAcCCtTatGCacg +gatagatAcaTGATtaaGtcccgcGtcatAGCGTAggaTggAcGGgAgcTtaCgACCaGGTCGCtGTAgCaGCATTTaCT +gccgCgAtAAcAcGGATtTtTATaCTGgGAgCcaGCcAgaCggCtGcgGggagTGCctGtcTAgTGAGAtAcaAgtGttg +ACcTgGCTGagAagtgcctAAAAtCCcgatTCAcCGGtcgcaaCTatAtTGgCTTactgCActtaCTctaAtGGGgcATt +AcACcTTTTgTtCtAtTCccTgctcTCtCctGtatAaCtactTtacgtTtGcGatAgAGAttCccgtaaGCcgcTGCtcC +gACtCCcgCcTGaTgaAcAATgTcAGtaTTGGcaggtaaTCcCCGaaaGtgCTcaaggacggCCatatGagaaaAgGTaA +caAATCCGtatCcCtCTaaGTCgcAgtgggCccCcTAAATAcgCgAAAAaCAgaaAAGtTCCcgCGaTAacTCACtctaT +GgcgATGCttggAGCGAAgGACCCCattCCAGcCGtAcaTcGcGCAgAgggAaGAcCCgggAgtcgcTCAtgccCtCTgG +TtAagAcaCTcTCtTccttTCCAAatAACTCtgaaCTGacagCGTaagGttacCtacGTGtCCaACGTGAcATCagagTt +aacctAGatTAcGcTTGGaaGcAGgGCCtttccCCcGAaAGccTacTcgtGAcGcGTATAggATgACAtAtTtaAaaaCa +GcaCAccAcGTTCtcttgtaAgTTTGCCagTgtTGAtTtCTCgGTtAcCgGtgAcgcCAccActaAcGAtGgcaccgcGG +TGaAaGaGtaTCcgTgGgCAgAGAGtAggTTTTaTCaAtgAgcGtaAccCCcgCCTcGGTctattGaAacaTCCCCcGAc +cCgCCAtggagccCcAGcCaaGtACGcgTAcACTGgAaGAGtACGgTGgAaTaCaTGAgTTaaGTacttcGtgGcTccTt +CGCacgCCaTGacatgAcTTAtgatcGGaAgGGCTaCGgTtATtCagGtaGggtTGTtTGaatcgGaATTCGGCgtTccc +GccgcgcgcAGTgCCGCcgcAaGTCgGGcaagTTTaagACTGCAtGcGggAcCTTgaaaGtctGGttatTAAAaTaacCA +cTcCccccTCCcCCGAaaAATCTgTGgAaGACCcCAcaAggtaTgGActgGtCGggGGgaGgaTAtcaAcacAttaTCaC +AgGGTtcAAAtcGTcggTGCtcCaTAcACtgGGGTGgTtagcaaccgtGcgGTCCCtcgtAagcTTTaccgcTAgCcgTT +tgtTCcggaACcgcTcaCCGCtCgTttAAGgactggAaAaCAgTgCgAGtgAtaGgCaaGttcccTgTacaTTAgAaGTa +GAAtaagTCTTGCaTtaGGgTGtGgGCaGcacgAtgcggAaTcTAcGgacCctAgcatggCcTgTCAaAcAAaGgcgaTG +CCAtaTGAAAgTcCAaaCTGTATGGTctTTGAGGtCCAgTgGCaAcaGcAgcgaGACaAcCggCTaacTTcCcTgtAATg +gaCggtATaaCgAGAtggGgAtAaCCCtcGCgCgcCAaTcGctTtttTgtCTaggatTaggTGcAaTtCCcGCCgtaCgT +GtTTcCCGgAgATaCAAAtcGGatCcaagTGcAAGTaTttcACcCtCCCGcaaCCCgttAGtCtCgAgcGtCcaacgaAG +TgCatCCctGAtTGtCCagCTAGtcaTCgTCGcGAtaCGcaACACCCccACTcCGAtgCTGaTtAccTTAcCGcgtGCat +gTCtTTtcTTTCctGCGCGTtGaTATgATttAatCAacccgGtcccAGtGAAcCtaCTaCAtaCtgTCCTCGACtTtaca +gTcaGAAAAgCTAtaTtaAgaaGCcAttcgaGtATGCcTaaGttAATgcGTcccagaATaggTCagCaAgttaGGAGatt +tCAaCgCCGGacgCggccCaaTTTtgaATCtACACAagCtAgAatgCCCCTTggaTGGAacgccctAGtCtgtGaGgGgg +ccCAaaACGGtTTcaCtTAccCcTCtCgaagCGtTtAAgTctcAatACggcaGgCGatCCCCgTCcTTCcGgGgtgtAAc +CGtgGTaATAGaCcTCCGCATTtGCgcCAtTtgggCCtTtCtGtTaaAcaCAtTCtTaAggacAtcTATTTtcTGCatTC +taTtGgCtgGgttaagCTactgtAgccTCtTTCGgctcACTCCCaTAttAcTCACcaGctTaTGATaAtTCGgTCTttaT +aTtGaCcTGaTcctggctTtTTGACGTGctgTAgGtGTaaGTgTAttCATtcctgAcAACgTtAAaAggCCaTAGtGctt +cacctcTCGtAtGATCgCtCaAatccgCGcGgtccGCcCacGGCAgaTTcgGcacacgcacatTTCAGgtttcGctcCAt +tatTcAtcTTTgCaTgCGaGaaCAgaTcgAGCCgTCCcAGctCAtAtGtcgTcaTaACTTAtcACGCGTCAgTccCAaCa +CGcatTagTcagCaGTGcacTTCaaACgctgGttAagcaGtCTCGAAaattaTacGAAAGGacAAttTtGCTaCagTcAT 
+gCaTGtttgAaAaaaTcaGTacAttaAagatGcGtcaCttCTaCAAGGACgtTAgAaATctGtcGgAttGcTcCtcacTT +cGtTgcaCaGCtCgGCgAgAgatcaTgTGCcAACgTTtggaccCAGtgggTTcGagtAtTGgattcctcGTGtgTtAaga +AagAaTCAttGtTtTagGatCCgACCGtCCTaGACGtAgGaggattTgTAACCGTgaTCAcgCCattGtTTAcgTTggGt +gtACCcgattctcCTtGTGgGAAgGctagcatAgcCGatGaCGGCgGGGtcgcCccatcAtacCTcAgTagccgCGtTac +tCCtctTtcgAtGCTcGCcTGCaCTACAtGCCtgaTcCTaGaCCGaCACcgGTgatcTggCTACcggctAgCcAAGtaAt +CGTcGatcaActtCgCttTTCCccGttCCcgtaATTTCaTAGTAGcAAAgtatcgacCtaGGcCtagatCgGGgCGGCcG +tgCAtCaCTTcAttTttGAcagctTgtAaatCtaAcacCTcActTCcgttatAacGcGActaCgCagAACACGatcCtgg +ATgccTaGttGTTTgGcaGtCtaGtaTgTGCtCtatGTtcgTcATtTtcCCggTttgCTttGTcGACccAccTGTAGCCG +tCCcgCgtTaagTaccTGAcaGCCtcGTtattaTcaTGTCAaGTAtGTGctgaAGGttatgTTcGtgctTgtgccaCGCc +aGTCTGgActTcTTAaGgcaGcTGGATTAaCTCcggttGAaATATGGTaAGgTaCGgCTaaaTTCGaTCCACcCGatccC +ttAcGTgATctaGgaCtTtTgTTTCgTcctGCtgAAttCttatTtTcaCgTTtcGGCtGCCCttaAttTAcTGGgCgTaG +ccTtaaaggACatCGTATcGGCcccATtGggacTGAccattCTccgaCggTATAtGGGctGGGcaCtgactGgGcaCCgA +GTtAGagTcAcACAGTTCtACtTctaAccTgTaTtAaacgAttCaCCAcGGaGtagatCGCGCgaCAtaAgtCCcGgcCc +AGAcgCgTTGTaAGAgATTactagcgcGacGggGggGcGcTatgACAGCgTTcAtGcCaTagGATGgTGtGCaCTatctG +aTTTccCCAggtCCAgTAGAAGAgAcTTtCAgACggCTAtTCGGtcAAtGAaAtTtCCggtaGaTAtcAGCTGAgaAtcT +GaCgacaaAgCCCttaaAtaaaaTCgtccTacaCGaTAgGGtgGttgatcTtCaTTCttTGgATATGCaGccgAaacGaC +CCCctTgAGAGCttgCAcggcGTTtCggtgaGTtGacCcCGGgTCgtgcAaAGGgCCCgGAGCAaTgAgTtTATcctGgT +gTTCGgCgctcGTcCaTcgTcCAagCgtCTAaGATcGCatcaCCTAtTcAcGAATTcGatcgAtTGCgTTaTATgtAGTC +aGTttaCcGtcgtagAgaGAtCGTAacgtGGgtaATTCaGACtTtACTttTGttggTgagtGtccAcTtCaATaTTTTaa +ACaTGGATGgATcaTACGaGAaaCtcCtaaaAaatTcGtgAgGTgaTatcTGGTGCgGaCTcacTacGTTCGaAGTAAtA +aGGAGacAcCctATAtAaGTaCCagccctaGTtCtGctAgcaTcttATATAcaaCACTATtaaaTCAgACttatgcAaGc +CGGtTaATGACGAgtaGTCccacaggGCAGaccaTtCTAggCtAaAgttATgtCactTtGTaGcAtAacgCCtCACtAGC +TattACaGtcaATCCTGTgAGttttaaCcGggAtgTTCgacgaCGcGGcTTcatAtACAttCcgAtGACAtcgATTaAAg +TGGgCGcTCTctattGtgAAtcAaCCATaCGgGcaCTATaCGcgCCggAATcctCcgGggACctGTTtGccGTcGTgtTG +CAgTcCtgtgcaTTCAGAAtTttTTTTAtctCcTCACCgTagTTTcGCAagGtAtACgCTtGCcaaGtgACggacTaCac +gTccAAGgGGAatacAgtaggTAgAAgACGCGTaCccactAGTgaaTTgTtgaGctcTcCcccTttgggaAGtggCGgac +gctcCCttGACGatTTCgCTGCGAtCggCGGTttgatGgAgtcTaCTGtcacCCacgctTCgAAcaGaAtCgCcgTTtaC +gATcACTTCAtttaacaccgcATAcAAActGTCgagaAaGtGAgTAgTCaaTAAtccgagAGgaTCATcTtaGcacATCa +CaGGGAtAAAtTAAttGaaaCgCctgATcgcTCGgGtttTgaCGGctCCcCgataAGgCAgtaTaCaggtGAaCGaaGaG +TaCCTgGgcGgtCcCgtGTcccCTCTtcagcggctTCtacTcgcGagAAcACGGaCAGtgcGGaTGCTAAaGatCAaccT +aGaaAttgCGgtATGCcGGCAtcTcacctaCgACGgaTgtgTCCTaAATTAAaaTtaGgtcgCcAaGGtGCgtTGattCc +ccGCGaTttaGAcGagGGTgCGTGcTTCCAAccAagcgCaGTaatTggTaATCCagTcTGaGtGcaTgcggCAaaAcGcT +CCAactTCCatTcAgCtAaacTtGctgTcctTcaAGtctGTGTCgCcTaGGgTGctacgGGAGcgCCatGcTaGtAGCTC +atTACGGGaaACaCCtcTttCGccgAGAcCaTTgACCgagtAAgtgttTcCAATCtCgAggGTaCCcCggCcTaaTGccc +AtcTCttGaAGGgTCAcggtTgttAaaaAGCaATAGcaCCaggAAtATcCCgcAtACggaAgcCATtAaGGatGcAtCag +CcGAtTGcTCATaggAtcAaGgCtCctgagcTGcCGTCtaAcgagTcGGtcctcAcTcTGgtcAgcAggtctcttCcCct +aACaTAatGCggtTCgACCggtTatttAaCGaGGgtAtTtGCttCCTgacAcAcTCtGCaATGGAcCCagCgGtCTTggG +cgCACtGtaCCTcAcACCatGgTaAGGcCcCTcCCcTGaGgGTAcTCgcggggAgctGtgcgaTGTTTTacTgctGTagc +ggAAgagTGGcgGCCgcGGaaggtggtATTGCTTTtgTTtCaGtggGtcCcgaTCAgGaacACctcgGGCTAattCTGcC +gaacGccctctcgGAGAGATtGGGcGtatgACCaGCcactgaAcgGTaTAGcTTtCAgACcATGTTCcCtGtccATgacA +CTtCAcAGaAAAgCgCcGtatTCAcTatCGtggGaTAtATgactGGtgcgcTCaaCCAggTCgAatAgtGTtggAtCccg +cCcCGagtTCTtTtttgCttttAtGGtgAGGAggaaCTatCTCcAGGagCGtGTGtgtTCaTGCgccACtATATaaaATT +gAaaGtgAGtagccaaAaCGctTAtAaATtAgAaTACACtaAacGagTAGAtAtCTcAcgATCTgcGatTTgACaTtaCG 
+AGgTaCaGctcTatAcGtccTTaTagcAtTaATCAcaAgcTCCgtAccACTtgtggTGcCACAggAacTGgtaGcaTgGC +CGtctGtGAgCTGTCAaCacaAtgCTGTcACCAaCcggCagtcGcCcAcaTGcACcTCTcgCTGCcaTGcGTgaGcgACG +ttgtTCTaAattaaaCAGAtAACTGcCaAcGCgtatTtcgcgcAGAaGATCTtaCAGgcctCAcTATGtgCgccGtCCCC +GTACGTgtcAtGTTaATgtgtGgGTTtTCTCGaCAcGAcCatTgggGAcaGgcaGcaGTATGcatCAccCGtAggGCtcG +AaGTgttaTtCgagAaAgACAcGatgatCgAattCGGtAttaGGgagcTtgAgcGAAcCATctAgCAttCCTaTGtAaGt +TctgCcagGAgcCTAcgtAcCaGgCCCCaGCTcTaTGacGcaagGgggTctcTGtcgAAACgTctAGCcAGgagggGGTg +cGtcgAgtACTTCCAAACTTCcAAGcatTcaTaCgctCTcccGGaTtAGCTTCcTGCgACcaGtaCaCAaCGGGCCCcAa +TgaaGTagTTGgCTTcAcCaACaaCAAtAttGAatccgAgTggtCactTtgAacaCtACaggAaACGtAaTTTtTGTAca +AACcCtgcacGTaaaaaTtcttTctTGTCcAaGtCcAcgCaGGcgagtTAGGtcAaCcGgGGAGcaAaTTaTAAcgAACG +CGAActcggcCgGCaAtaagcGgAgACgagttAAGcTtttccgCgCgcTGCAActgGcGggAAcTtgcACTTacCTcCCA +gGcAgGCACcAGaAaccGaACctTtgaaTGGAAtATACGCgaCtgCtGTCAtacgcCTTtCCGCataTAAtGGCCGaGTC +AgTgAAtgtTaaTTCTagATAccGTaTgagGCaaTTaacCAataCatAgCGaaGGTaTCCatAGCCAcggTTcAgttTCa +GacTGaacCcAcCGACTAGGAtAtAcAaGcCCAgCGTcACcTcGCctAcTcgCaTGtAtgcTTTATCgcACAaCTTTgTT +actCtTGcTGtGcgCgtAACtctTTcCTtgTacggCGAGTAaaggGaaCcGtCgcTaAGACCcAaacGaGCCgccAgagg +TTtgATCAcTtCTTtaAaaTtCGAAaGCACTCcAcAtAgGGGCcaatgCtGcCtctgcgctGCAgGcgGcaGGtgCTCCA +cgtGaagcGctgGcaCTAtcAACgAgActtTtctTGcaCagATaCgcGcGcctTAcctaaAGccGaTGTTGgCctGttgA +actActtcAaatGcaAcTGCATTCGtGAtTtctacccCcATGaGTACCcaGTTcCctggCTCaatgttcagtCAggGCat +ctgACtTTtGtagTTGGCagctGgCtgTAAtCTTGgcGacCcGCACgCGtGTACGTTAtacggACATcCGAgaCGcTgtG +ccGGCcCGGaTATTGgCCaACaTGAtCgacaGagcggGGtCgGGAAGtaGttGAcTAAcAtGaGcgTCCAGcacCCgtag +CcgCtGcTGGctcCcCcGATtTccCGTcaAtttTtTcCCTGaaCtgAGacTCACTTaccTGgtCGcTGcaaaatAgTCcC +aaTttATGCtaAagctGCtgAgCaTTcgAgacgatcaggaTtAcACcGGCgCTttCtCCggAAgggacTaGttatGgAcC +AcatTTCtGtGctcTacATtaAtCTTcaccacAgCAtGaTGTTtcgaActCtttGTgAAAAAtAgCTcc +>2 +CtcAGcATgTaaGacgtcgtgagCgACCcaTAAggaaTAtTcAgcATaAgGCtgccGAagAgCAtCTATttTTaGTACtt +TatAggACCggCAGGtAcccTTgccaCctGGaCtTTAGACgaGagcCgttGTatTcActcTccaGAaAcTcttACtGtAA +CTCcgtTAAAaagCACccgCaGCaacagCtCTgCAaGccCtgCTGGacgtaaAAacccGAATTACTcctgaCaggagcAc +CTGTGtTTaTCCGtaGgTCttActgCaaAcgATCtAgcCagctcTGTCAcTAAgAacgCGCCacacGTgAagAGATgGcc +ATCtTggtCCCcCctaCttGtgGGAACTACagtATTctCTAcCtcTATaagGaAaAGgcACaatGggACTatCcTGGagt +AaTgcAaAGCaGTCaTtCCaACgTACggAgatAGctagcgCGGGGCgCcCAaGaGGGTTgAttATagcAtGGTCGgtAAt +gtGCAcGgATtgaggGTgacGtGGACtCCAAGTtCgGCCtTcTcaACTGatcTTGACccGATTTGGTgGcgTgGTTaACC +GgaTGGAGgCAaCGgaTccgGGaTaacacCaaaACGggCAAAGagTTgtattaAttCGggtGTtTcGcTACtTcACCGcg +TcTgctAgtccTaaGgtAAgtCTTtATttgaGACCcCAgCcGgGgTTTcTAGgGCCCATTCgCTTgctgTCGGgCTgCGT +aaCgCtAAagGagtgtCaCgAtatGTCATgtataaAGcGCaCGGGgcTCTaTTaTAATcgGggcACTaAgGTAaTCcaag +AGcGCcgtgGcGtTCGTtaagACACtAAGAGTcCagtCaAGtCGaCTaatACATtaAAgGaACccCtgAtctCAGAagGa +GactaGgAtgccGcCcaGcGagagcCGtTTCcCcatgATCCCcAggcTACGTgATaGGTGagtCgtACcggcGaGAAAGC +gtggCTcGgCgttTggcggCtATTTaTtcttCtCgttgggCTTAGGACtaagAGgtCACtTGaGggCtTTtTACcttgAt +tCgcAgAtATAGgacAATggcgACATtTtGGTtTaTGTGTAcCtAcTaatACcaCTtcgggtTaATgcTAcAgaacaGtg +GTAcTAtAGTcCaAcaTGgGactGaCCCaTActGGcCagTGtttgATCtctTGacTttacaCtCaCAaACGtGTtgACTt +AtTatCtGAGTCCatTgtCAGTaaaGTTgCcctACaCCCtgCtaaTtcggAaaGtccCgtcAccAcTTgTTcTGTtaTaC +tATaTCacATAacATaGccaagTgtCTcTcTGCtTAGcGTgaTATTcGGgCcGAtgTgcccGTgGAAaGCtGAGCttaCT +GtcACcGTAcAtTGCCGAgTTTTtGtCcGGcctAaataaccGaTtcAcctAaaAtcagTggaTCccGgaTggCAgCCCga +ggGAcAACaTGgcgCcCtatCcGgcCCgGcgtCcaaAtCttCcTgtCGtgcaAggTATgGatcTAgcTcAtagTaaCCct +taCGTgCCaGgCtgAGacGgGGcaaaAAtTggTcCaGTGccatgagCttgTtAaCTgACtcccCTcCCgAaGccGGGatt +tgaAgaGacGgATcCTtgcTaAATTATtGTgGTGtTACgctgCGTGCCTGTcCTCGCGTtTAACTGcTAGcGTAttcCcG 
+CtaAAgATGaGCgggcgCAAccCtCAcCATtaGCaCCTTgcGCtACTtcaCcaTAtGtaCtGcTgCTtcgacacTgAGAc +ggCgtTagtgCaccggGaAgAcagggGcAgGGGCCggGCTGaTGgCtgGTGgtAtGTGgAGCaCgGGGtgAcGagAacGa +gTcaaGgGtatCgAGaCtGGtagcctcAccgGGggGCAtcCtgtgATGcAtCatGAgAgaAGCTgcATtaAAAcGTcctg +cGtCcaTaATAcgAAAgtTggAtTatGCaTcCttAgaGGAggATcAtgttcaGttAGatAAcTTgtAacGAcatTgtCgt +gAattcgCgtgataCGtcggTctCCtAactgTaccGcgAcTCCTgaAccACcaaGcGCTGgatTTTAGGACTctGcaCCa +AAaTTtTAgCgCatgaAgccACCgTTtCACtTGTCAccaagtTaGAtCaCTgttTaTccaaATGGcccCCAcCCgctggt +tTGTtTaGaATtTctcaAcTaTGagACcCgaacacgctCaTCGaCgTATgcCGGtCctcGGggtGAGTggtAcAGTTagT +caTtcTgAcaaCTcgcACtCTTTgaCACCTaGTgATCtaAaCtTCgcGAgAttgaCAagAacTaGtcTatATGaAacCCC +TtTGGTgCTAggCAaaTCCcTgAgAgaTAgTCCataGaTTCGTATActtGtCaTatCGCaAgaaTAtTgAgcttaatGTc +gaTGTgCCgGtgAtAtccgTCcTtcGaGCgcTaTtGTaGtagtcTGcTCgTCTACGtCTaaTTtGGGGCgtCttcTcaAA +TgCaGaAggtGttTGacAGTtGTCttTAAGatGggaAaggCAcgtTAtcGCACaAtcGACcaAcaaAGCGTgCAccgTAC +gaGgCgcAGcaGGGtATTtcCaAAcTAatACaAAGATGTtGtGCatGgGGTgTcAAgGgTCGcTGgtgGgGcgGtATCgt +CcgcgcCcAGaCaAgtGaagttaagCgTTccTcCActgcTgGAGAAggtgccaTCTtGgtaAtAataTggaGcCTttATg +aAActCTgGTACCgTtCCTTgtTttCGGattTCtaATGtTcCacCAactgGGTAaGcTGcGACGTcTtCaaGTACgcAGa +GcGTGATcGGCaCtgtTACtTTcCgAGCcCCcAGtagAaACagcttgGatcgTAtCACctAACaAtCgGGGGTCGgcacc +ActCTGTAGGaGgGAcACCGaAtaTgCaTtgCatcGTCTaagCCGaCatGcaaaAatTGtAacCATtcGAaGcAActcCG +aAtAgCGaTGttaaGGCGaggagatCcaGtgTACaatgtAGtggtatggatACCttgtGatagatTAggtCtaGACcTgt +CtttgGAaGCgtgcaTGTTaACTgATacTAaTGagAAggAgcAgCGGCAgCATCtTGTagCCaTcTcaAtcTTgctgggG +CtTCAgCaACtGcctgTTTAacGGGcAAcaATCtcAAGtggGAaTGGgAtccCaAAgCAAgTTtcacCgcGaacaTGTAc +TTgCCtgcGgcTaTtCCatCaAcgaaTGtTAACtgTtAGtgtCTgAaTcGCAcCAtaCaAcAgaTAtctcCAAGacGaaA +ttTctaaCgCaCATaAcccGgCCTCtgGCgaGaGACcTcggCTCaCAACAtTtTgtaaTGCCcgagcTtaTActCccAAt +CcAGCtACgGtCGAgTGaAaTTaagAcAaATagCaTatgTTGCCActaGcCtGTGatGGtgacAggAtAcatGTtTcAGA +aTGCATtAtaCgGCTCCtaagCTTtGattAtcAtCgGATTCCcCattgtccgTcCCcGtgcgtGaGTTacaTtacAaTTa +agTcaGCATTGacGatCaaAcGgAaTGAttgacgGCCCAGAcTTGCCCttGactCAACtcCCatgCatGCTcAGtGTagG +ccCcCtatgGCtCgGaGAcatGggggTtATTtgCCtCTTGGGACgGaggCGACCtGaGAGCTCctccGtcCgaGTgtGAt +TacCgccctaAgTctTtCgTgTCcaatAACgTcaGggtTcGatCTtCACctAtTggaataCgCtTaaGaAaccCgCaGaa +GGTaGaGggcgttacaaTTAGaGgctACtGcTccCGcTtcgGggcatCtGctgcCAACAAGtcTgAcaacgtAtcACGAa +GaTgATaaTtTTAGCGCcgAacaTGGGctGAaaccTGCgtcAgCtgGTagGaTtTgtGATAgcCcttccgctcAaagatT +aGtaAacctatCCTGTagGCtgGCgGGcgaaATAaCcTaaaGaTttagattcaAGGgCcAtcGTgaTaatTGTGtgCctt +cACcACACTtAAgGTcGgaaGcAatcgActCtCGgTTAGacGGatgcgcTagAAGagcCCCaTCGTaCGCTTaCaaGagT +GAtTAGatGCCcGaTTAaAagggACCcAGTccattaCtATgTTaATTAGcgAtaGGAtCaAACagGtgAAGgcCcACtca +agGTagagGatgAacCaTCTAAggAgCagAtGGCTtgGCttGTtGccAtgAAatGtaAcCAGTtgGCAacACGAcATTAA +GgtCCaAgtacCAGCAGgcaTtgaGCCGggaCCaAGtcttacTTcttCtcgCCatCtcGACaAAACaCggcAcgacACTC +AttcaTGAggttAaagCtaaagaCAGAAGaaCTAAgAGTGtAATTtTGgtAcGTttTAGTaCTgaCaTCCCAGGcCCAGa +aatActAagTtatgATGtCCccGgCCTtatcAgGGCAtgGgCGACtAATgCTTAAGGTTCatCGtcaacATcCaactGgt +TtATcgCgTGAcGAGCATGtAAaCAccTGCGccGtTaAaCcgCCttGAgaGTgcAgaGctgGGgGcCtAgtATCcTGcgA +cgGgAaTaCgTAcctagAgTcGCGGgttcAgggcCgAgaTAgGgCgAtcAAgTCcAcGATCaCCgGCgatgTActtttAC +CCCcCGGAAGCCAaCAcatgCtGGaCttCcagttAtCACaTtacggTGCGcttgaGTaACtCctgGtTTGaCaAcTTTaA +gACTCACGtTTTtGagGgatTcCtAaGgCctCTGTactCTcaGGgcAGCAcCAaTcGGgTctAgcactTaggGaAtcaTa +GttAtGCccAtagCTgagActggccGAAAcTggGtGATAgtaTGacTCAaCCCgAATCCccgaGagcgTGaatGAGcTaA +AGgTaTatcAAgtGGGTaGGActGTaaTgtCTCtgAaCGagGcGgTGCAgCgttAcaGAagTtTGcGtCTCggAAactTa +AAccaGTatGCATACGtgagaTgcGcCTctCCTCCgacAagTGcgaGAGGatattGCATAgTaGgaCgtgcCAAgTCagT +TgTGgtcatcTCaATgCacaTGtTttAaCAacctctAagacgAagATtCgTaCGtCGttcTGTAagATtaatCCTCttTG 
+cAtTACaccacGtcGgAAgCCGaGGaTCaTtTtgCCACcGgcACAcTCaaCCGAATGCTATataTTCgTtggatGGtTGg +gcGgatAAGaaCAgggAactAtGAGaTTggActAGGCtTctTcCTgaAcgtCcAaTcGtCttCggatcCTtaCGACctCG +AtcacAAGTAGCGTttccTgTgtTtGgtGtgTTAcaTGtaaCgCctCgtAcCgCaactTACcAAtCtTAtaggGTGtAcG +AgatgCCGcACaTTtGtTGaAGAtGTgatTAccTTAgAtATACccgCtaGaGCcgCTAccCCgCGTATacctaTTcgcAC +GtTgGCagccACTttGgAGttgCCGTTcatTTgAGCCAtttTTgttCAcATCgctCTaaacAaCGcAtCagAtCaGGGAA +TacgAgtGGTgggGccAcgTcCAtGCcgTCTcCGgcaAacAcaGtTCtcgcAaATacGcCaATTCGACAcgcaaaTgatC +GtccgttcGTAgACccgtCtaCTAgtcCgGGCTaGcGCccaATCCAACGCgaAGtCtacTgCtCgttgGACgGccaGgTT +GgCTTcggGGGAGggaAGgGCGtGGcgtgGACAttTAgaAcGaTatcCgtTAtAgAactCCGcctaGAcTgAacGgttcg +acAGggCaaCtCaacGTCTagAGAgtAACGgACATgcTGCTgaaCTGactgtAtAataGacCAgGaActtGTGGagCATt +ACcAccaTGctATCtCTacTgTCttCTGTcAtgatcGGtttAaAtcTGGccAcCtgTccGccAATAaACCGGGtgCCTGt +ctgAAGTttACCGtaAACaGaCTtAccctGCttaaaTacCagctagtTcagccataacgtGgATACtGAcTaCaAAaTgg +TGaCACtgCTaCGGtaGGgAgaACcACCTgttGgGaGcCCACAaAagtAcgtcTCCGcggTTctaCGACGATTcctcGGa +gaACcTaTtGacTGTgccgAcGGGAtaTTtTCCCtcAgCgTCgaTgaACTtcTtacttatgaCTcATcgGttCgTCGtga +tacAaGGtgaAGCggAgcGTCacaGATcgAtTTTcCcctAaactctTTCtcCGCaCaagaCgcTcgTcGcATTTCcTTca +aggaAgTtCcttCAaTGTCTATcCAcAGtGCtCCaAccgaGTccTTcAGcaCCgaatgTATcttTaCacccAaTaAcCgg +tccCTtTtTttagAggccagaCGtttTTTGCtaGaTGtTcTCttTACACtcTTtcCaaTAgGCGtCtgagcTCaCCCcga +aCAaGTtaagAcaCggTCtGgTataCtTgCTCactGTtGCcgtcGgGAaTAgAcAGcctCCTaAtTgaTtTTGGtccCCA +tcAAaTTGCAcggaAaTactaTCGTcgGTGAGCaTCgAAtcTCAatCaCgGGccTgcGCGataCgcATAATccTCCGaTa +AcGCAtaaACTGgCCgAaactCgtAACtCGAgTcacacCctCGAaGaTgAacATaGaTcACGTgAgGTgAcGTgaCcgCG +GtCAAACGgGaTcgCaCCGAgCtcTTgaCCatcAgctgaGTtgAAgaagCaCtcTGgtCcTgCTagTTTaTccaGCtggg +CttGAGCcGtaTCGAgCGaTTATcgCgctcgGGAGgCCAgCATCTgCGctaggagAcaATtgGgcGGgtTtAtgCgTCTC +TCTACatTCacaaCagATTATcGCggACgCGttTcGGCtcacctAaTTtaTGcCaGAGTgagCcCaCCtAAtAgAcacaG +GGGcGagaTGCgaatcagactTGCAGtctTTCgatcTatGTTcgtTtgAttgACcCaTCAGgcAcGagCaTCccgaaCCC +TaTTGggAcacCTaTCgGaaTtgCGaCTTCtCGgGaaAtAccggCTAaTggCaATATGAtTAaaCCgcCtACaCaCtttG +gaAgctTGaGtagCgtGCGattatcTCtgttgTgAgaCTctCCcgGAGGcgCtAATCgcactcTtCaAAtTCTggtAGTT +gaaacGgaaGGaCctaCtcgAGATaAaacTaagcgtTCtTggCcgcTttAaGcatcCcgaAcGggcTcCgTGTagaTgAA +ttGTAAtgAcccGGGcCTgaaaCCcCCTcaAGTaACttaTTTAcaAGatcaCATagcGGCGacCgctctcCcAtGATGgc +gaGcGtggtTaTCCTgttgTaaATgcCgacATGaGCcAaCAgtcTAccgAacGgGcGGGgGaTggTtctgtCgtgGataC +aggTTcAcgctAAAaGAaacaaGgTcACGCagGagaTCtcgATcCAggTGTgCaAaacAGCCcctaGccTaAgtcAaAgG +AcAcCttCTtcttCCaGGATAGtAccAGATtCTAtatcATaAAaTgTCctgtcgcgcTAtAcAGcAcTtcgAgagGcGgT +ataTGCtGTGttCggtcGcATAcATtacGtaGaCagTcATTGAAggcgCcGCcCtACtCgTgGgaGCcATAActggaGGT +tcCAtTttTTtatTgGaAAAATAgGaCAGgtgGGcgaGataggGCtCcaTTGGaTcCgaAACAagAgATGgggcAGATCA +CTTcaCaTGCaCgtcgAatcTaaAtTgtTTTtGAcAtACacTgCGTctctGttActCTCaTtcgtTaCACcGtAgtAttA +ACAcATCAaCtCGGtggTAtaggaCCgcCtgAtcCcAtaAcGtTAGcagACaaAGggACTcTagacAgaCAgGatatTAT +GttaAAccaTGgcCcaAtccATAcAtgctTgagagCTgtAtATGAgTTatGCAcCCtTaACCggATggtCAGAttcGtat +CttcTgCAccggCCGCGaAgttCTGcataacCgAtCGG +>3 +agCgctggTtTaGTCAATaGGcCccATcaCGcGgtatccgaAcaCcTTctACtTATTGcGACcCGtATgcccaGacgtGT +TAgTATcgCgtTtGGTccAGGGACtCAgaAGtagCtTCCgTGGCtaATcTCgCGTTTtgcAGAaTAtGGGGTaTagaGAA +cagccAcAATTCTGGATTtTtcTAaCtcgacctaAcacGgtaagggACagGacGcGccCTtTtTTCacgAcaGGCgcgAT +gCTgCgcCgCAgCctGACAtCGGAATTTAAttAGAaGCtcCGtTgACgaAATTcTGGGCCagAcCgAcaaGcTcCGTgCA +tGAgtCccCgACgAtctgttgcggagCTTTTaTaaggCcTGatTAAAtcGAttGtttTTggtAcTaGtCcaaaccAgGAt +cGgtGGCgtTGAtTTCTaAGgAGAagCGtaAcgcCcGTggcAatcatCtcAtcCcCCGAAAccTtccAGAAGCaAcGatg +TAaATTACtCTcgAgaagATCAGgcAagGGGGatTtgatCGgCGgGcCTGtccTTAaTActCGTatgTGgACCTTtcCAT 
+gAgTTTgCcCgTccatcGgACCaGtAggGactgaTattgcaGAattTcAcTCtctgtgGTTCTAccgATTaaAcCgTaGt +tcGTtaGttgTTggtTggTcGGAAAgtcTgCcTCCtAgTgcTcCCtcAgcTaTaaatGCgTGTcTtCTtCAATcAAaCCg +aTgaCAGCGcgTGGATTTGTaGGgAggGGGTtgGTcGgaTtaACtcGAcGACCCAagActtTcCtCCTatAGttAcGaAa +cTtGCGTgTacGtaccGCCAgaaGcacgTcCgTTAtgACGTCTGCgACAaTGAGtttatgATtAtAgcCcCtACaGCcgt +AAtAGCagAGTcaaagGCATcAaaAaaACGcgtCATtAcgtcaCttATTCggaGcttGCTtattaCttCtTTgcGTgccA +TTTCgTtttCgTaaGtccggcagAggaCTGGCCaaCTCtcACctTGGTtCtccCGcGtGCctaTGcGcAtGgaGgtCCcG +tGtCGAcaggTCCgactaggTGGTaaAAtCAcCatggtAgcCGaaTtAgatttcCGACcTgTTtGaAgTCCTTgGGATGA +agtcGatCtCCcTGGattGgtcAAcGACAaaGTaAtCgGtgacGgGgTgcCtcacGaCggCACAAgggGtCgactACCCC +gTTCtCgCTgAGcAgaTTAgTCTCtAcgcaGgaaCAccccTCAGTtCcGtATcggcGCCgtATgTggtagGtaAGGCaCt +aCGCTActgGCcGggTCGAtagacTctggcGAtaagGGggaccGCcCaTgGATTAGTaTAtgTCcTtTAgAGTcttCgaG +cGcGAAACGggaAATCtTCAcTggAGaCaTttAtCGGccGagCtCGACCGGCCgcttAtCGCcTgCCgtCTTAgaGgAaa +AGaGCtCGAgtgTtggcaAcTccAtCtggCATtaTTgTCgTaTTTCcGGctagTCCGTgtCgACCAgAcGaaatCccAgc +GtgcTGGTaAGggCGtGAgtaTCGgtAACAaTGATATCAtggtcaaAtaAccggCtTTgCCTgaaggaaTgcGTtGTTtA +tgCTTtcTGtTGcAttaGgAGaGGactATtcCGttGgcAcTGaACtGGgAacaGCTtctgcaccgaTACgTATtgCaGGA +tgcgccTcggataTagGcAgCACgaGGACgCcCagACGAgAtgACTcgctgGTCcCTCCCcCAgctaACGaTTACcggCc +gcCtGcGGCactaAacAgTCtGtACctccgGAgGcattataCtAGTaaggatTCcCATtAaAcCcCtTAaCTTCccCaGa +CTCtggctAaTactctgCGAcatTTaGaaGcTttccaGGCTtcgCgCgcacgtcCTTAtCCgAacATTggttATTGtcaT +gtgagACagtgGGaGTaaAGtGccatcgCtGaTActcATcagcGCgTAGGtaacccGTtaGtaataaaAGgGGAcGcCAT +AcTtCTTGTTCCTcaGGTgCAgcCCgAACCGgctTcGtCAGGtTAGcCTaGgGcTGGggAtacTtGCGCaAgtcccGgat +AtgCcAtTgGaGtCAcccAaCtTcGTATCtaCgCCCTCccCgGcACAcAAatCaGTgAAGgGgcgTcCcAACcaGcatGC +ctCtTaTGTgATGacctGCggAAtGTGGACtTaGAtCgGgtCCggtCAgGtGtTCGCGaTAacTTAAaCcCgCCcttTcT +acgGAaCAAGgTaagaGctTAAaTGtcTGgAtGAaAGgacCGCGcgAcGAAgcTgCAcCgCgCAATACtCCTTCtaGgCc +AaGcGCGGGgcCtGTgtGcaTgACcAAaATgtGaGGTTGggcTcCTCgttTcaTccTACTagctctATGCcGcgATaGgc +GCaagtcGgTTtTgtgTCTtaTTGAGccCaaTGgAAtCTTGtCAtCtgtTAtcGcgGggactAGGTtCgCaggtTtaAaC +GCctccgcGAtgagCgtcCGTTCcATGAtaaGAgGCtAcTagaTaCGggTaGTCtCAaCGtACCaGttAgGcCTGtGGGC +gGACgtcttAtTCgTtTGTCGCtGaGaGTtcTGaGGGAgaGGgCtGcatatTAtGcGgcTgcgcaCaGGaTtTgaACGCt +tcGAtgCgTGTcGAgggccAcGTCTgCgtAaTaTGggTTgCATCtAAcGcAaCCaACtAgAgCCgcGtaGCtccatgAgc +tGTGataTtccacAACaCGAttCTGCAtTCGGAGgcgAAacACTgttTCTGcTtGTCcacGaATgGcTaGGcAtTaTTAg +aAGaCaTtatAGgACCcaCgcTcCagAcTtCctgaaTtTgcCtcCCcAcCAtcaaCaTAGgtAtCCCGtctTATggacTa +tgGcGTGgGatGTcTgAtaGGcaaTcgtGcCAGAtCCAtTtTCcgcCcGtaCcgCCAcAGgGCAgTTATCcctgctaTgT +ATtctgaGAcgaagCCCTGAtttgTTTCCacGAgGagAAACAtgCAggTGTcTaTcctCTCAATcaTCatattttCggCG +aGttgACTaGcTTTTTTTGAGgGGAcTATgCaaAaaggGcCcAAATTAGacAcgAtgGCtAatTatCCcTaGagTaGTac +GagAaGTtTGCgTactagaCTaCCGgTCcGACCCgAaTcGGctagAcGcGTtTTCtCcTaaGTgtGccgCcCTCtaaccg +CcGcGtgtgGCTggTgGTatAggCgaAtAgCTgTAGtCgcgggcaGCcCCAACtGtTTCCAaTCGaGagtaaCTGcaGAG +TGgTgAtTggtaCACgAttaCtgAtgGaTGCAgCAGATTCGaGTcTGGTctCgTgGcTgtacAGActCaGgCGACCatcg +aaaGgTAaAgTtCCCctGCAGgtCtgGGtTcCTgtaCtTGgcGGGcaTcttGCGCgGcgtTAGcCgCCCttGATCAGtcg +CtAtGtTaGGtGCgAggaGcacAagAagAgGTAATGtTTtcTGTtGCTCTAgaCCgAtGtcGgCtTCaCGaatTgCGgGt +gtCCtAacGTTCaGgaaaGgaGgagacacGCTTgctTTatGaATcGcCAgGtaTTCCgCgaATgCttAtGTttgatcCgT +GCCaccCACGcTaCGgcattTacgGGcGCcaCtTAgAaattgGcTCAcGTtcaaAcTACTTGCATtaCGATTTcGcAgaT +TAtAACtTTaagTtagtgAGaTTATATCAttGCTgTgACAatcgAtCttcGgAACgaacTAAcTtaGttCatTgAttggt +caTaaAGCAAcTGGcATGtAtcgACAAatCTcgTGAagcGcgAaGaAgAGgCaTcaaCTtAGatATCaCcCgaaGatGAt +caAagAtgTCccCTaTaAtGTAGCtAGCCtgagtGgtGtGATGgTttgcggTtcTGcAtgATCGTTCgTTaGTTtaAGCA +gACAGTTctAGtaaaCaatacTAATtccttagAtAtTCTGgCtagGgtctatgTagatgTagattGCCaattGAtTatga 
+AccCgTaaTCaCaCgAGgTtTCGTTaCgtgcAttgtcAcctCTaTttgctTaGttGgGtaAAgagGGGgtcGaatcTATg +TacatctaAcCGATTGgTgAGccTAgaTGgTAccAaaCCcCcAtAGTGGCtCTgcagACCgTtAGtTgTcTAttTattgg +GgCCtaTgtcATaAcAtAtacataGtGaTtGgttgCgtcAgCACgAcgATgCAtACtaGACAGtggAAcTacTGtaCaTA +TtCtaGTcgCGAgaacCCAGgaAgGAcaCTcTtTCaACGCgAgttCTaGgGTATTaTTccacAAtcAagtATAtAAaatt +acAgCAgCGAATaGtaAGCAAGcAtaTaTtTtTGGTGGtggCatTAgTAatcaGgcTGTGTAGgtatTTCgAgGCAggaa +GGGcCgctAgtCgAGTTActGTTcAAATCgAACTCttTCGGcGGCgcAtcTggtAaaaTgTacaAAcATggCaGGaCTcc +CTAGaTTtAGagaAcaaGGAGgAgggCtAaacgGcGgGttTctgACccTaACaCGTtCacgaTCCttaTgGCaTgCcCAc +atTACGtCGaCttGTCtacCTgAgATccaTtagCAATGGAAGCgtggagTATctaaGtgcCttatGtcttagTacGGGGC +GAACctaaCtcacgacTcgaaGCAAtaAAtCCcTAGGAtatCGTgaAacATactTgcCgggGtaTaGCAGggAaGAtaac +GGGGccGGaaAaggCTCggGTCGTCtGcGcCTgcTgtTaTAgTcCGaTgCTttGagggatGcCtCTCGagtAAtCgAacG +gTtTAGtA +>4 +GaAaTagATgGAATAAAgCTCtaCcctAGtatCgtCGAccAGgGtGAcTcCGTCATcTatGAGgATAgtTtGaTTcGGcG +gggAGGgtgcaGGTCTCCcaaTCaTaTCCgtgGCatTtcTTGCTTtaAAaTAgCcTACctGCgAaatCCgcGCAGCgctT +CcgGtCCaTtatAGGGttAGcCtAcAtgtCTGCGCtTccatAtGTCctgttCCttGgcggATcgcTAAACGCAGaAgACa +aGaTCtcCgActgaGTagccggTACtCtGgttgTcccTtcATcgcAGcgAcCacAaatAGGttcAacAGacCaTCccGaC +GCAcAGCtaCtaCCTgTACATAttgtTaATttCTcaaCtCCCGACGTtCcgaGCgtaCAggcctcgTACCaGgTCgCcCC +CAggCAgTCCtCTtcgGATaTccCggTgggAcTcAgGACTCTtgctcaaCActaAtcaaATTTAgctgGAgGCgTaTCcc +ataGaGTATtTATAcCacggAcgcGTTgtCcAcGtagacAatAcaATTcGAAaACATCCcTGCaAgGgattAtgcAGcCt +TgGtcgtaGTgTgCaGCcaAtacgCgttGacatgcTagTTTaGCaAAtGcgGccGggCtATAgAGaaCTGtagCGATtTc +gcTtTaGTgtcgaCgacACTCaCtcaTTaTcGCCGGGTTCgcagcCAcTtcAGtAgGTaCtATCCcaAAGACagGccccT +gGCaCgTcAccCGatCGTCtTTCTTCGGgCGgGCAtGcGGActCGaTCAtCcCctaaCATAAGTtgcCTTGggccGtTag +acAtGAcgCggGaGGATATgcggtCaAgaTggCtTGgCGaTGGgAagtcGATgctgGcACagGAcCTCcaTCAttCtgTT +CgaTAtATTCgagAAAAcagACTtGtAAtcGCTtccgTtcGGGCtgCcaagctgcTAgCaaCcCGcCaTaaCggGaAAaG +GTGgaggtTGtccaCaAacGAcaTtGaCCATCacTcGgGTttgagctcgGCGtCGcataTAccCgGACTCgTCACTTGGT +acCtTCGGcCGgcTggCAaGTattACGGaGCacggcATATATtGtACcggAcccCtAtGatatACaActtcgGTataGAg +GCtggTccGCTCAcTATcGagtagcGAgcCAAAgTcgCaGAgCcAGTctccCTTtTaAttTcACGtCggGcAaaAGTacC +cGgcTGGGcGTgCTgAACAtAAttGtcAgCgcctGTACgaCaTtGATTAAggGCgtTATTtatTaCTGggcTATtATGtA +CtAAtAAAggAgaaCagAcgTgCcCaAcgGTgttcGGgCaGgAAttActatAGaCaAgaAaCCcTCtAaAACGtgAgcGa +AaTccgtAGcaTACATaTAtTaCaAGGGctGGaaGCAaaccGcGgATTcaTGtGCacAtCCGtcGaCgGcGAGcatAtcG +ctctCCAAgATcgACacacCttggACgAgtaataATgTcAcGAAggGCGtgcTAaCTATGCAgTtgCgGTTaGgctATCT +gCAaTTTtGCcaacCCtgagGTaCAGgCcGaaCGgcCaAgGCCTtTcAGCGCTgGttCaAcGcaCTccGTCGCcaAACac +AGAAccTgATTggAtgaagGcGCtTgccAcCCCaaTtTtcgCaTttaGctacTtcCcGcaGTaTaCAaGTTggGtCCGgc +TgCGcgtcgtcTaaaGCatAaCTGaaaAcCAgtCAaacGaTtcCTGTtatacTGacAcgaaATTGctGActaAGCaaATc +cgaGCcagaAgGCtcagaCCCCgGaAacAacgTATTATTaAaGaAgAcGtTTcCAgCcTtGTttacCtTtTtATCcttGt +aCAcgGTgTAGgaCgTCTCaAACgTgTttCCgTCATtAcatCCcCcGCcTtggCCaCttCATtGatAtGgAcTAAAAcCC +AAagcaAAtgTGCGttGttCggggtTagtACgAccCTtGTccggGGcgTtagCCCcGGAgAaatgACCagAgGaAtActA +CCgCCGcGatTacAAtGGgTctctTTcctGctcGaacCCCTtAgAcCgagtcaGtgTcccTCACtgCcgGaTTGcGtgGT +TgtGcaATcaCtcatAAgTCAcCcGaAggcAcAtTGTatccgCAGAgTTGAacGaTAAtaCcggTggcAAcaCTAaTctC +GctTAATGCatcgACcAgcCtagtTattgGAgCTGcgATATActAgCCtATagAGcCAGaCcccAgCAtACGGCGtgtac +CGGGGGgGcgatGaGtcaCGaaAGaGcTagTctctTGGcCTgTcaggttgtggTTctcCttAgTTCGAGAGtactaaCga +AAGaagtCCCccaccgGtgttccCTACcGcGTcTCttAGTGTTtAagactgCccACcgTTgagGGGCAAAttcTtgCagt +ATTAtgGtAgaGTtAtataaGaATTcttgCCtgTACcatccTtgacTatTagttGgTaTggtcGcTCCCCTaCAtaactT +tctATAcCgCgAAACTATTtACTCTATTcgCAGgTaaTGtcattAaAAaTAGCCgtacggGAgctccAGAGcTGcCGTat +TCatttgtcGTaaTTGTtaTAaAatcgGaaaCtctcacCgtaAgacGcTGcccGACTTtTCTgACAacaaAtAAaGgGCT 
+gcAGcGACAtcaCcCaGTCTCCCtTTaGtCcGTGGCGgtaAcGGCGAtCAAaCCAATttTGTgTgtTaCcATGATgaTTt +CTatGaaTgCCtgtGagcgccTAcaTAggGtGATTTgaAAaCGACcTcgtctGGAtGtCcGcTtCcGaCtGaAcGtttCA +ATGTgcTGgttttaAAGctaAcCAgtTtcCaaCgTttaGAAgCGTCccagTgtgaggTGCctCGcTgaaaACtTcAGCTA +TGtcgCtTTatCcCAAaGcaccaGtacAtTAaAaatgGCGATGggcCGtGCCTgCGTcTTTAGagtCaCaGcgaAgaACg +AaGAttaacgtagccgGgTtGTgCTcAtGggtGTgaCtCGcttcCGcActGcggCAgcCcAATctaAtCtCggtgCccga +ACcGCaAAGcTtCAacGTCcAAgaacgTgGtTTttaCCAcTCCCGCgaCcgGTggaTGcACAgTGGtacTcCCCTgaCGc +GgatcCtgtgaCgTataGagcGAACTgATGCTcATTaTctGtTTgTggCAaGGtCgAgtgATGGCGtGCCaTGAGCGGga +TcaaacGaTgGcTatcTATTTCaaAttGaTaTactACcCACcGagagcACgCAAGcTgcgctcgTGaaACtaaTTtgAtT +TcTGTgTtAGaaAAGgGCGtAACcAACGCCgaGCCTCcAcAtGaGtCaTtaGgGTaAAcTtaCAGGGaGtggActatCAT +CcaaatcAAgGAacgATTgcattATtgAcCGaggAtTtgtgGAGaACCTgttCtgACCgtgCcCCgGAGTtcTgAaaTaT +cGTgAcaGGttACaAGgcaAaCgaTaGctTTCtcttgCCCcaatAAcgAgcATttatCgctcActtGCGtcACAGTtagA +CCtAcTGaggGGAGgccGAgtacCgCACCcAacGTAGCtTaCcCtgtTTCGTcCgtaGggTCgaCtCCaTGgCCtCccGt +CtGtTaCgcGTCCGAGaTCcATAGGTAGgGccgAAgTGgcCGTtgTGgGAtgCgAgGCGgAGTTCcAGAggGCtAActca +GgTATaaCaGgCCgcAGgctTtCcaGgAacCGcAcGACCcGgaGcGCgCAGTaCGgaaGgtaTGaCTtagGActCGaTGT +TgTCAtaAGatGtCgggAATtcatgtGttccGcGCtCTTTaGtGGACCaATGAgCACTCtTTTAcCATTgGCTtgTtgCc +ctTttcGCccCctatCcggAcATaGGcaACcggcataagCctGccCtcgACGgTTCgaTGCTTAtGAcACacgTgCAtAa +gcTaTTAcccaggAtGTCGgaatGatTCAgAGACaTCAAgCATgCTGGCaGaCtGgGtcCagtAgcTtgtTGgcgtcgcA +agacaCgCgctGTTTttCCTAgctttGgtgCAtGggtaacGtCgCCgCAGtCgtctcGaTaaggctaTtTccGgAaagAC +CtAGGAGCaGCgagAcaTTAtctAgCCTcGTacGGTCtcCTATcGctGcCgCcCAtGtTGgaCgCTtAAacTCGAcCCCa +TAaAAGaagggagaTCaaTCgATTcgTgaTTcCTGCccAcTCggacaGataGCcGTtTTgAcaTGTTCacaGCgAGAgtA +CtCtgtATCtTTCAGCTGCtaaaAgTAcGcgGctgacccgTgcGAACcGcACAgtgAaCTgGGCCtaCctGcAAAATAGC +CgAATCCTAggtCcAtGCtaaagGcgCcTTtatTTcActcCCGTTgacAgcgTccTCGccGCgTGgacCgactCacaacC +GgAgAaAgGggAtGtAaCGgtCgcTcACcGccGgaccCCgCGgTgCTaaCGGTtGtTgatCgCAcTAGccTgGATtAaCT +GgTGAcGaaaTCacatgGAtGGACgCAaGgtaCtaaCaGCGcTGcGataGgaaacGCGGCgggCtGgAcTACccGgCaCg +gtAaGtCTgGcgCcCtgGCATACcGtActCGcaatcAgctGTTTGCgTcGCACGAGCgGTAcCgCaGAGTCTTtAgCTAg +tagAcGAaGcCTACGtCcaTgAcgGgcAgGgagAGAGTCATGAtaCcgcGcagtTCatcTaccAcgcCCgCacGgCgcaG +AAtAAagTtcGTGtGttCtGcGgcTgcAgTgggtaGgaGcaATGtatcgcaCcGcgCcgGAGCcCcTGGTACCTGtGAgT +GACgttgCCgcCcAgGaGgcACTcAGTATCgcAGaattAgtcAAtcagGCgGcCtACTccaTccGcCtGgTATacATAgg +catactCTACAGcggaAGGAgcctTGCGaGaGtAtcGTAaActTAtACagAAGTtcttTAGcCcgcCCTGacTgACtGcG +ATattGtAGagTaAATgCaAGgGTAgtaaGGAgacGgactTcgCcCtgggTAcaAgccATCgccGtctGCaTGaCCtCGg +CatTGttcActcCctaTGAaCaTtCaGacctGTCgaATtgcgTAgCcAGAaGatTCgTcCGttaacCAGAtcTAgActcg +CgtCCTcgAgGatccgttaACTCTcCATgagGCAcAcGcGgCGGcAGTcCgTccgtaTTTGtgAttgCCCCACcgtGGcC +GgaaTtcGatGttAccAcgccgcCATaTtGGCaTaCtAACGccccTGATCtAgtTAc +>5 +gatATGGGTgGgAGatCgGttaAATgtCaTTccctCgccaGagTAaGGtCtTcAcggAAggGTTtTGTCGttGTtAgAtg +tatGcAAaCcccgGAAgcacTCccGgTccTAaGcATcAAcgcgttgGatTcAttCaAtCtAtGTActTAttTACATtcAA +AgTGtatCgaACCagaagATttCGTATCCAatCccATcGaacAGAtctCcgaGgGtcacATgGaCACCTTcTaCTGATca +CaagtttAcTcCCAATcttgAGtcAtaAttCTaTcATACgcAaCAgGtcTCCgagTGacaACACggCatTgTAtACtaGg +TgCagccGcgTGGGCaTtAtCaaACgAacCtgCTaGCGgcGaGgCcCagttaACgTTaaTATggtAAcGCgcacAGCaGG +AcGtTttttgggggcTCtaTCGccGgTgTTGTCGgAAcAGaggcGcgTaTtctCAatAaaATaTTgcactatTaTgCaCC +CtACaAATCTtCagAATtcaggTtAGaGgAttcgatatCtaaatTTCTgccaTgAGaggAAcgaCgCcGcAaGATgCGct +TcATaGTggcCCcgtccGTaagcTATTTtGtAgCtTCgTCACCGGtAgcaTtgTaAGcCGaGtgTCATatGAcaaccCAC +TGcgATgGgCTaggtaGcCgAcTGagGcGCacAgAGAGTcaAatcGGGaTTcCGcCCAccgCcAGTTAcgaaatATtCCA +cttGGGCCAggCATCaaAgtGaCgGGccGGGCgAgtgAATGAtcagaAGcGacTCtCTccgcCtGgggAagTTgagcGaA 
+cgcAGGgATataaTcgAAatctatgaCAtGcaAGggtgtcGGaTcaTTAtcCAATgtCGtcTAACtGGatggCCCtAGaC +atCaAccCCgTCgtacgtgTgacGaGcattGaGCtTCTCgAtTgcGgcCGCTCgaTgCgCaacCGccGAggTcCtTcTga +gcTCGTTAtaTttGctgTAcCTaTcAaCggCTggaTGGggGAcggAGtTCAcaCTaCGCTTcccGtTggAaTttTcGAcG +gTtGaTgCtATGTttCGgGAaTcgggATtCGtCCAGagTAAcaAGCgACAGgTaACGaCgcAATtACagAccgCgaGTct +tcgGCATaaaacAcgAGcttCGgGcTaaaTtcagGCACAAGcGActACGaAaTAcTaTAgAAaAAgGTcgGcTtcgcagC +acTtcgGcTCCgAAAAAatTGacgGGgTaCtTAaGGACctcagtaccaTAgCTGTtACgAcCtcCCgCgCTATgCacGaT +ActACaTgTAgGCcCgGacCGcTgGGtTACcagtttgcATtTGGatGGgcATGacgGCgcGgAtcAcAtggaTtTtTcTT +aGGCcGACGcGtTaggaTGaGacCGTcGGAagTaCAAAtttccaTACctCgAtCGcGCCtTtATagCAAcaaaAAcTCgA +CacTTtGGCtGcCgccCctcCTgCctCtttGgAggGATACtTcCAaTgtaattgCagtGGGTaCtaTTGaaGaTTgTGgc +AtACCtCCcaCtgaTcAgaaGacctaGttttAaGAaTCGAtgacgaTGCTGtCGcgtcgAGTccACGcccAggtcGAgTC +tcttATaaAagTagtTCTcgGGaCACgCcaAccCgActTATgGaTGtTATTTcgCAgGattCTCCtAcaGaaGAAccGgc +AcgATagtCGTaTaaaCtaATtaaACaggATTaCtACccTccAaAACCACTctTGTTTCTcGGGActaagGcaGCgGTga +aCtATtTCTTtCAGTaCGggCCgcaAAtGATCCgTgAAAaCccAgtTcCagTTAAcgAGACCcggCtCgCCTaGtTTtAG +CGAcGGGTaAGtaAgATcGCgttATCtAacCtaAAaCgGTAAAGggttAATccgCcCtacagGTAaaCAACCTgcTgGTC +cGaACgtgcTcCCcAaTAaCCTcCacTgCtcTAcCCTcTtCTGcGtCCacaCtTGgaACGtTtGGaGatgTAGCAgaaCG +aAActGAcccttTcactggTgttgtAAGcCtGtTccCaTGaCgAAtCagctGgtTACCGgcAcGtaCGctCCTGaTAAgt +cTcTATatGACTaggaACagAggAtGaGACgCtgTCaATCAGATtacGaCcTCCgtaTGCgtTATaaTcCtACgaCcTaG +CgGtTTaaGctAtgtggaggtcAtcttGCaGCaACGATcGttTcTtCTtTctgaAGaccaAacGGaaACcCggcgaTAgT +AtCtATatgtaCaGgTttcgGACTCaAAAgGaGAgTttTTTAcgCCCgATgCcagaGTCTAAaaGtCTGCaCCcacGaCT +CCCTACttcAttgGAGAgCCTtAtgGAacgaGcATCCaTcGTgtTCtcCtCTTctcCcAAACCgCCagCGGttCcagtac +tcTagCTAcgTgtTgCAgCAAttGTaggAacGaCAAACtTgCagCacAcaATcTGTTccgGgtGcaaACTTGTGTgACCG +CGGCcAAtatTTCGgtGTaTaGGtcCCTgTGaaCaaGtAttgCGaTaaCaccGgttGATAaggggtGTCattagtTgTtG +aaAAGAcAgGGCGgTGGTtGcCATAgGgCcgTcttgtCTggaCCttGTAAtAAGGctaagGaagtTgGtcgAcAtctCCA +CAaGGcttTGaCatcaaAGGTtGcgaAgAcAcacCcTtcaCgcAtCgGgcTtGGgCcAcgCATtcCtGgTcAtaGtgGTG +GaGaCAGcATagcAttaACaACAACGcgggACTgCtaAAAAgaggAcCcActTCATagaAAgGCAcGGcaaAAtatCgaa +AtCCGTGtgCgcGCACgGAGATTGaAAcGCtTAgaGTTTACCtgaatTTAaaGGggaaTGcgGTtttcaCACGCggCtTA +gccCtTTcAAAacTTcggtgTAaGGatctAgTTcTaATCAagCTtGggCAtcgTcCCCAagGgAggcCTaCaCACCctaa +tTGCcTtaatTaGctACcTggAcaAtTGcTtagcgGcGGgctTcGcCGgttAAAcTTAAgGccCgCGaCtaTtcGAgttA +GCTcaAgacAaGaGCTGtgtaTCCggaccGaCAtCGcggcccACtAtcGCaTcCATtTGGaAttCaCcAccAcCCgatAT +TGcaGtgCAGaTaCGAGATattGgcGGTaGCcTTAcgCaCAaGGccgcCaCTcCGccaTgtGCCTCtgtcggCGtAgTAa +TAGAagGtCGcGACAcTgAccACcTGCctGatTaTGtaCGgACAGGGTgaCccCGGtCGaGtgCAccCGtTtCGCTtaAg +tCgGGttCTCcgGAtgcAttCTGagTaaAtgGgatCAgtccGgcTGtTCAgAaCATGttagtccTAACaccCGgAAactg +ATcTttCCtcAaggGacCgGcTaGtCGcGgGatAcCtaaTcgAACCccctgaACccAtgGGaACACCgtcATatgAccCG +cCtaTGcTcCagctcaCGAACgccTTcCAgctTtgCgTAAGTcttTaCtGtTtTgCGTGGAccgGAttTtGGcaTagCTa +tgGgcgTcGgAtAGgCCCaGAgcggCgggCggCgaCGCGAcGCAtaTGAaTtaGacGtGtACtccGaGaTGcTGtGcCGa +CtTACgCAcccgcTCCttaCTCtCAacacAgcCACctCTtatgTtaacgcTTGGtgtCccgtactCctgtTgtagAgCga +ggACccGCatgctGctgCaAtATTAgGgGTTaGgggctgaTtTTGcaaACGGGagtGTGCCTAtGgTatGgCgcTGtGTg +tgtagGGGtCGCGACTaaCgCCcGaCAaTAcTTtGccCgCctGtagGGCtggAaAgcGTCAtgagGaaGGTatTGCCAgA +AGCCTgcGcTggtgcGTTttGATCGgaAgGgAccctgAaTgtTTGttCGttacTgctaCCTTtACGCTcatTtttTTcCg +caTaCtTcGaAaacgatCtAAggtAGtaAgGcGTtAaTgtTTAGGCatGacgCgTCactCttgaGAacaaTCtTctAggC +gCAgTGggCCcTTgGTAcCGGgAgAATTaggtggAtaTgCgAGcgAatGaAtttcAGcAtCgTAcgGaCcccAcGtCGCa +CgTTtcctGGctgCtcCTTCAattaGAcTGGtcAcCgAaAgGACAaCacTgccTCTtttGgTcACaTcTgGcAAAgGttC +cTtAGCcacCtAATcgaAAtctCGTtcCACaAAGTTGatCTAgtaGGgtaacAcaCtTaTgAggAAGaagtAGCCacgTG 
+TgACTtggTatCCAgtgGCGaTgcGGggcCGGGtgCTGGCtTcgagtgtCaGaAgagaCCacGCtACGtCAcgAcGtcAa +AtTCGtTCgcCActtaggGAACTgGaCgTAAacGtCTcGgggTATtTCGGAAAATcgccaGaAtcCAacAaTATAgGaTA +TTCcGagCTTaTCCacTgtgGAgAgtTgGgAgGCCcTCacAcgTccATaATgATGACccGTCCGtGGgCCaCCtgtcgac +gctgGGAaCtAtatgGAAcGGGGaTTGatActtCgGCgctTAcGAGacAaATTtTTtGttaTtcCggGTTcGgTTtcTtg +gGtcATcgaGcTGctAgGatGCGtaTaCaTat diff --git a/t/data/test12.fa.fai b/t/data/test12.fa.fai new file mode 100644 index 0000000..93a5ab9 --- /dev/null +++ b/t/data/test12.fa.fai @@ -0,0 +1,5 @@ +1 8789 3 80 81 +2 7958 8905 80 81 +3 4808 16966 80 81 +4 5257 21838 80 81 +5 4592 27164 80 81 diff --git a/t/data/test1_1.fastq b/t/data/test1_1.fastq new file mode 100644 index 0000000..d8e3b08 --- /dev/null +++ b/t/data/test1_1.fastq @@ -0,0 +1,400 @@ +@HS3_9090:7:1008:11310:354/1 +TGGCGGAGGCTTTTTTTTTTTTTTTACATACAACTGTCGTCATAATATGCGCGGCCTTGTCATCGCGGTGCCCCA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1438:2003:100013/1 +CAGGGCGCNNTTTTTTTTTTTTTTAGTCGCAACCCGGTGTGCGTGCGACTAGTTGTGCCACGTATGGGTCCCAGA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1271:1905:86956/1 +GAGACATTGGAAAAAAAAAAAAAACTGGAGAGACACATCCCGAGCGGGCTAATCCCCGATACTGCCGCGGTTAGC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1305:791:83162/1 +AGATCGAGGCTTTTTTTTTTTTTTGGTGCCCGTCGTGCCAAACCGCGTCATTGAAAGTAAGTGGATATCGAGTGA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2746:14451:329/1 +TAGGTAGAAGTTTTTTTTTTTTTTCGAGAGAGTCATACCCGTGTGAAACCGATAGACTAATGAATTGATAAAATC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2708:18759:118692/1 +GTGGGAGAAGTTTTTTTTTTTTTTGTCTGGCACTACACGAAAGTGAAAGCAACTTCTTCGAGAAGATCATTATAA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1588:3468:141624/1 +GCATCAGAAGTTTTTTTTTTTTTTAGTGATATCGACCCTTACGATCCCCAAGATCATGATCCCGCTTTGCAATGT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2921:15856:57939/1 +TCCTGAGAAGTTTTTTTTTTTTTTCCGCGTTAATGCCCAACTACATGCTACAAGGCTACATCTAAAATATGCAAT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1417:15923:95590/1 +GCATGAGATGTTTTTTTTTTTTTTTGACCTTTTGAATTAGAGGAATATATGCTCGCGACGTCCAGTGACGCACGT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2102:10514:9268/1 +GCAACAGTAGTTTTTTTTTTTTTTGGGTAAGCTGCATGGCAACGGTCCGTTCTGGAACCACCTCGTTGGGTTACG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:54:3501:193579/1 +GACCAGCCTCAAAAAAAAAAAAAACGGTGCGCTGTGTCTGTTGCTGAGGTTACGCTGTACGCGGCAGTCCTCGCC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2888:15166:71020/1 +TTCTCTGAAGTTTTTTTTTTTTTTGTTAAATGGGGACACCCGGAAGTCCCTCACCATTCTTGGACATATAGTCGG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:782:4801:172498/1 +TCAGCGAGGCTTTTTTTTTTTTTTAATTGCGGTGAGGGCTCCAGATTTTGTATTCGGGACAAGAAACCATGTGCA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1840:12105:187126/1 +GTTGTCGTNNTTTTTTTTTTTTTTCAATGTGAAATATAGATTTGCTAACAAACTCTTCTGTTAACGTTTGGGTGC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2153:1390:90531/1 +GCCGGTTGNNTTTTTTTTTTTTTTGAAGTGTCCGCTCCGCCTAAGCGGTTACTGGTACCTCTAGGCCGTCCCGAC ++ 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2362:6530:45744/1 +AGGTCAGAAGTTTTTTTTTTTTTTCAGTAACCAAGCCTCGAGGAGTCGAACTAGATCGGCTAGGGCATTCGTCAG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:992:19260:109704/1 +AGCTGAGAAGTTTTTTTTTTTTTTTCGGCAATAAATCGGAGAGACTCATTTCTGAATTATTTTCGGTTTTAGTCT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:70:16099:51055/1 +CTAGACTGNNTTTTTTTTTTTTTTTGAGCAAAGCGTATACAGTTAACAGTCAGCATAGCATAGAGGATCTTTCAA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1603:19029:119019/1 +ATGAAATTGAAAAAAAAAAAAAAAGCTCCGAAGAGTTTTTCATGCGCAGAGGGCTATCTGCTGAGTATGTCCTGT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1872:3885:17311/1 +GATGACACNNTTTTTTTTTTTTTTTCCGGTCTTCGGGAGGTACAAAGCAGACTCTATATAACCTTGGCAGTCGGT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1547:15205:108796/1 +AGCTCACAAGTTTTTTTTTTTTTTCATGTGATTTTCTCTGACGGCCAGTGACACGGACTGAGGGGGCCACGTACC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1307:9506:116191/1 +ATGAACAGNNTTTTTTTTTTTTTTTAGGGCCATCGGCGAGACCACGAAGGGTTCCTCACACCCTTGGAGCTATTC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1293:13399:104992/1 +TGCTCGAGGCTTTTTTTTTTTTTTAGCCTATCCGTTAGAGTACACGATGACTTGTGCTTTGAGAGCAGCGTTCAT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2245:4790:19502/1 +TGAATCTANNTTTTTTTTTTTTTTCGACACCCCGAGGCGTTAGAGCTTTTGTTAGCAACCCGCCGTCGATCGGCA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1630:16736:9274/1 +AGGCGAGAAGTTTTTTTTTTTTTTCACGGTTTGAAGAGTGCGCAATCGCGGTATTGTTCAGTAAGAGAGTCGCAG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1548:4346:62294/1 +CCAACCAGGCTTTTTTTTTTTTTTGGTCACTCTTGTTTAATGACCCGCACCCCAATACAGTTCATGCCGACATAT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:32:14349:156816/1 +CTGAGGAGGGTTTTTTTTTTTTTTGACGAAAATCCGCTCCAGTCAAGTCGCCCCCATTCCGTCATCTCACTGTGC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1322:16932:51209/1 +ATCCTAGAACTTTTTTTTTTTTTTTATTTAATTTCCGATAGGCCAAGATAGAACCGATTCGATCAAGGCCCGCTG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2657:15332:141945/1 +CCTCCGAGGGTTTTTTTTTTTTTTTAATGAAGATCTAGTTGGGCCGCAAGTAGGTATAACCTCCTTGCGGATCAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:681:17025:20507/1 +GTCCGCTGTTAAAAAAAAAAAAAAGTCCCTCTTGTTACGTAATGACAATCTGGGAATGAGCGATGAGGGCGTGAA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1904:5984:48273/1 +GAGTGAGAAGAAAAAAAAAAAAAATGCCATAACCCGGAGTCGGGCACTGGCGCTCACGGCTTCAACGCTCAGAAT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2002:6401:148580/1 +CACTTAGANNTTTTTTTTTTTTTTCACAAAATTCGTCGCTCGACTGCCTTATCTAGATAGGATCCCAAGGAGCTG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2051:6388:64396/1 +AGTATGTGGCTTTTTTTTTTTTTTGCTTAAAGCTTTGCCAGAAGACTGTGTTTAGCGTTAAGCGACTGCGTAAAG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:333:16095:70445/1 
+TGTTCAGANNTTTTTTTTTTTTTTAATTACTGCCGGCTAAACCCTATTGAGCGTGATACTCGGGTTATAACGCAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:235:4347:163096/1 +CGTAGCTCNNTTTTTTTTTTTTTTCTCTAATTGTCTGAGAGCATTAAAGCCGAGCTGTTACTTGGTCGCTAGCCC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2459:19347:74593/1 +ACCCGGCCNNTTTTTTTTTTTTTTATATGTAGCACTACCCTGCCTGGAGAGCTACAAAAGATGGTGTTCGGAGAT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:810:16180:96914/1 +ACCAGGTATAAAAAAAAAAAAAAATTCAATAGGAGGTACACACCAGCGTCACTAATGTATCAATTTGGGGCGCCA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1634:5631:85585/1 +CATGTAGAAGTTTTTTTTTTTTTTGTCCCCAATCCTGACAGAGCCGTATGCTTCGTGATCCGGCTTTGGGAGAGC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2962:11381:21217/1 +ATGTTGAGCCTTTTTTTTTTTTTTGATCTCCAGGGCCAAACACTGTTACCCAAACGTCACAGAGTCTCTTGAAAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2633:13732:192447/1 +TCGGTGCTNNTTTTTTTTTTTTTTCAATTTACATATAACTGTCGATTGAGCCTCTAAGAAACTCTAATGGAGTGA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2087:17402:125788/1 +AGTCGGAGGCTTTTTTTTTTTTTTCGGTCGCCAGCGCTGCTGGTGTTATCCGAGGGTCTGCAGTTACCTTTCTGG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:781:9732:169661/1 +TACTCAGAAGTTTTTTTTTTTTTTTACGTCATCACCCGTAGGCGTATACTGACAATGGCCACAAGCAACCCGGGG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1662:16108:137898/1 +GAGCTGAGGGTTTTTTTTTTTTTTCCTATACTTCGGACTATGTTTACACCTCATTATCAGGGACACCGTTCGTGA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:919:19941:114306/1 +CACAAAATNNTTTTTTTTTTTTTTGGAGCAAAAGCAACGGGTCGTGAACTGTACGATTCTTTTTTATGTGATGCC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2754:8750:58584/1 +TCAAGTACNNTTTTTTTTTTTTTTACGTATCCGCAAGTCTTCTCTATGGATACGTGCCGGGCCTGATTACCTCGA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:336:1521:165232/1 +CTTTGGAGGCTTTTTTTTTTTTTTCGAAACGTGCCTGGCTCGTTCATCCCATACAGGAACACACGTCTACTATCG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1355:15756:164948/1 +TATTCTAANNTTTTTTTTTTTTTTCACGCTCACGTCTTAGTTATTTGTTCGCGCCGCGTCTATAACCAACGGATG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2231:4273:60186/1 +GTCCTAGAACTTTTTTTTTTTTTTCGCAATTACCATTCCCATACATGACGAGCATAGCAAACGCCTCAGATTGCT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2316:6407:47755/1 +CAAGTGTGGCAAAAAAAAAAAAAAAACGGGGGGGGGGGCTGACTTCTTTAAACTTACCGGGGTACACCAGCCCGT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1180:9151:105392/1 +ACTTGGAGGCTTTTTTTTTTTTTTTTGACTATGCCCTGCTCCTTACCTTGCCATTTTCTCGTACTCGCGATCATT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:316:19348:39980/1 +ATGGATAGNNTTTTTTTTTTTTTTACCGACCACATTTAGGACCTCGCTATGTCCCTAGCTTGCGATATTGCCTAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2293:5852:41801/1 +CTCCGGTGNNTTTTTTTTTTTTTTTGCGCATTGCCAAGGGAGCCATGCAAATGGGCCAGGAAAGACGCTGGTACC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
+@HS3_9090:7:194:2629:116686/1 +GACGCCGTNNTTTTTTTTTTTTTTTATATTATCCGCCAAAACGAGGCATCCACGAAGAACGTCGCATGTATGCAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:504:19694:38065/1 +CGATGGAGGCAAAAAAAAAAAAAATTCTACGGGGGACTATCGGTCAGTATATATACATGGGTCGATCCTGGGCGC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:40:16690:103457/1 +ACGTCAGTAGTTTTTTTTTTTTTTACTCGCGCCGTAGAATATACGCTTAAACGTCCGATTGCAATTAATGTACAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2665:14543:75970/1 +TTACAAGAAGTTTTTTTTTTTTTTACGACGAACTATGATTAGAATCTGGCTAGGGAGATGATATTCATCCGGCCA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:238:8472:192625/1 +AATACAGAAGTTTTTTTTTTTTTTTGGCCTGAATCACAGCGTTGCGTAGTGTTGCCTAACCTCTTATGACAGGCC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:457:4583:118080/1 +TATCTCGANNTTTTTTTTTTTTTTCTATATCAATAGCCAATTGCTCTGTCGAGGCCTTCCGTTAGCATCAGTGAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2124:7309:107717/1 +TTGGTTGTNNTTTTTTTTTTTTTTCTTCATATAACTTAAGCCTATCGTGCCCGATATCGGTATGATAGCTCCCGC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2793:15293:1228/1 +ACTCCGAGGGTTTTTTTTTTTTTTGCTGCTTTTCGGTAACGATCAACCTGCACGTGAAAATATATCGGTTGGAAA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1021:19152:59391/1 +CTCGTAGATGTTTTTTTTTTTTTTACAGTGTTATAGACTGTAGTTTATCTTGTACCCGGATAGCCTTATGGGCGT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1834:14761:60846/1 +ACCACGAGGCTTTTTTTTTTTTTTGAAGACCCCAGCACGGGTCCGTAGGGCTTGTAGCAACAGTCGCAGTGGGAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1085:18337:179806/1 +GGGAGGAGGGTTTTTTTTTTTTTTCAGCTAGTAGGCGGCGGACACTCCCGTTTTGAACCTTTTTGAACTACTGTT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:224:19111:133561/1 +GACCGTCGNNTTTTTTTTTTTTTTAGATTGAGCCCACTAACCCTAGACTCGCACTATTAACTCCGGGGCATATGG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2983:826:9193/1 +CTTTCAGAAGAAAAAAAAAAAAAATTACGAAAGGACAGGCAGTAATCATGGTCCTGAGTGTCCCTTTGTATACAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:338:13920:17267/1 +ATGGAAGAAGTTTTTTTTTTTTTTCTGTACCTCATCTCACTTGACTGGAAACGCACCATGGCTTAAGTCGCTGAA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1364:13563:167748/1 +CAGGGAGAAGTTTTTTTTTTTTTTAATTGATGCGTAGGGCTCGAAGTCTGCACGTAGAGGAGGTGCATTTTCAAT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:607:13103:139333/1 +TAGCTCCTNNTTTTTTTTTTTTTTCCCAAGATCATTGGATCCTTGTGTCCTGCGGTCAAGTACCGGGATCAGAAT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1747:1408:125904/1 +CTTGTGAGGCTTTTTTTTTTTTTTACGCTAACACATCAAATAGCGACCGGTCTAGAGGGAGACCGCACTATAGAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1148:16465:13804/1 +GCCGAAAANNTTTTTTTTTTTTTTATACGAATCGCGTTCGGGATAACTATGTTCGTCCTCCCAGCTAAGGTGCCC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1176:3250:145877/1 +ATATGAGCNNTTTTTTTTTTTTTTTATTCTGGCGCTATATGCGATTTAGAGCACAGGTACGAAGGCAATCTAACC ++ 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2792:6902:44593/1 +ATCCGGAGGGTTTTTTTTTTTTTTCACTGAGTCGTTAACGGGTTACTAATCTTCCTGTCTTGCTCAACTAAGGCA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2857:19869:180991/1 +CCCTCGAGGGTTTTTTTTTTTTTTTTCCATAGCGGGGGAAGCCATCCGAACGCAACTGGACATTACTATACCGTC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:755:19931:178280/1 +TGGCCGATNNTTTTTTTTTTTTTTCAACATGTGGGGTCCAACATACACACCGAGCAAGATTAGTTCCAACTGCTA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2480:155:63389/1 +TAAACGCGNNTTTTTTTTTTTTTTGTGATCCCTGTTCGGCGTGTCGCTCGCCTCACTACTAGCCAGGTGTGACCT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2843:10575:3131/1 +ACCCTGAGGCTTTTTTTTTTTTTTGTACTCGTGTTGACGGATTTCAGTCGAGGCAAGCACAACGGATTGATATTG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1770:4081:197491/1 +GTTACAATNNTTTTTTTTTTTTTTACGTAACAGATACCATCTTCTTGCACTCGCTGGATCATGAATCTGCGCGGT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2062:3642:13147/1 +TAATTTCCGTAAAAAAAAAAAAAATAACGGGTCAGGAAGCTGAGTCGTCAAGGGTTCACCTCTAGAGCCCACAGT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:193:6561:99678/1 +CGATTGAGGCTTTTTTTTTTTTTTTACACCATGTCGTCAACCCAAGCTAGACCTTCCTACAGCAGGTCCGGACGG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2562:3408:19709/1 +TATGTCAGGCAAAAAAAAAAAAAAAGTATGGGGAGCGGCGCGGTTTGCGCCGTATCGGTTCATTTGCCGATTTGC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:97:6403:33976/1 +GAAAGGAGGCTTTTTTTTTTTTTTTCGCCTTTTTCGTCTTGTTGCCCGAGGTTAGATATGTTAACAGGTCTATGC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1693:9857:189526/1 +ATTGGAGAAGTTTTTTTTTTTTTTGACCAGTGACACGTACCAGCTGTTGCCTGGTATGGGTCCCGAAGTTGCGGG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:723:3632:121816/1 +CGCACATCNNTTTTTTTTTTTTTTCTATAAAGCCCTAAGATCAAGGGTTATATGACGATGCCGACAAGCTCATAT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:590:19844:171664/1 +CGGATTCTNNTTTTTTTTTTTTTTCCTACCTTACAGTCGGCCCAAGGACAATCGATTCTGATGCAAGTCCATGGG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1274:4201:171834/1 +AGTGCTTCNNTTTTTTTTTTTTTTGGCCTATAATATATCTGTGGGATGCGCTAATATCCCTAACGCGGGGTTTGA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2299:4436:81404/1 +CTGAACGTNNTTTTTTTTTTTTTTCGTCAATCCAACCATGTCGAAGGGTTCTGCTACACTCAACTATGCAGGCTT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2962:13933:163719/1 +CGGACTGTNNTTTTTTTTTTTTTTGTTGATGCGTCGGCTCGGACCCTCCACGCGCCCTGGATCACCACAGGACAG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:79:10717:191559/1 +AATGCACTNNTTTTTTTTTTTTTTACCTTCAATGCGGTAAGCCCTCCGGCAGATTGCAAATCTGTTAGTCTAGTG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:728:6848:44913/1 +AGCCTGAGGCAAAAAAAAAAAAAACTGTATACACTCCACAGAACCCTCCACGAAGTTTCAGATTTAACGAGGAGC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1805:19028:29375/1 
+TGCATAGATGTTTTTTTTTTTTTTAAACCTCCGTTCGCGGAGTATCAGATGATGGTAAAACCGGTCAAAGTAGCA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2525:11691:147816/1 +CTTCGGAGGCAAAAAAAAAAAAAAGAGTTTCGAGTATGCTGATAGCATCCCCGCAGCACGGTCAGCTAATGTTTA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1456:3792:55151/1 +CTGTTACCNNTTTTTTTTTTTTTTATAACAGCCAAGTAGTGCATTAAGAGGATAGGCCAAACCAGCCACATACCT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1759:15822:177463/1 +AAGATGAGCCTTTTTTTTTTTTTTACGCTGAACAGACTAGAATAGGGCCAAATCTGGCGAACTTGCTATGAATCG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1434:12716:156687/1 +AAGCGAGANNTTTTTTTTTTTTTTCGACTACAAACCCCCATACGAATAGATTGATATGGATCCACCGGGGACTGA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:79:14794:163993/1 +TATTAAGAAGTTTTTTTTTTTTTTGCATTTGCCAATTGACCACGATCGCGCACGCGTGGCTTTCGAGATGGCTCC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:313:3610:41049/1 +GTTTAATCNNTTTTTTTTTTTTTTAACGGCCCTAGCCTCGAATTGCCCGGACCGAAAAACCGTCACTCAAATCCA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:766:16681:39314/1 +TCTCCGAGGCTTTTTTTTTTTTTTTCGCACACAGACTTTTAAAAAGGCCATCCGAATCCCGATAGTTAGCGTAAA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:494:9805:67437/1 +TCTTTGATNNTTTTTTTTTTTTTTAATGTGCCGGTTCCATCCGGGCTACGGTCTTTCAAGCTTGGACACCGTATC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:638:6498:188736/1 +TGGACGAGGGTTTTTTTTTTTTTTGTTCTCAAGACTTGGTTAGCGTTAACTTTCTGTAAAGGGGTTGACGCAAGC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:350:3180:176176/1 +GGCTAGCANNTTTTTTTTTTTTTTACTCGCTCTATCGCCCATTTTGTCCCGTCCACTTACCGGGCTCGTTCGTTG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/t/data/test1_2.fastq b/t/data/test1_2.fastq new file mode 100644 index 0000000..afa3c2d --- /dev/null +++ b/t/data/test1_2.fastq @@ -0,0 +1,400 @@ +@HS3_9090:7:1008:11310:354/2 +AGCTCCGCCCTACACGTATGCTCTGAGTGTGTCTGACTCCTGTCTCAAAATTCATGCGTAGTCTGGGCCTCTAAG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1438:2003:100013/2 +GACTCTGTCCTGTCGTTGCACACATATGGCACCGGGATATATGGGGCCATTGCTTTTCATCCTGGCATAACCGCC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1271:1905:86956/2 +TAGATGCATCGTCTGGTTCTCAAAGCACAAGACATGTAGAAGATAATCGGCTTCCCTGCTACAAAGACACTTGCT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1305:791:83162/2 +CTCAGTTCACTTTTGCAGAATCCCGGTGTATTGTACGCGACAGTAAGGGACACGTCGACGCTCAAGGCTCTATAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2746:14451:329/2 +ACACCCATCTACAGCCTCCGAGACTCTCCCGAGTAACAACAATGTCCATTTCATGCAACCCGACTGTAGCGGAGC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2708:18759:118692/2 +GAGAGACTGTTGCAGATTATGCTGTCGGGTGTGAGGGTAAACCTTGCTCTCTTTCTTAGGCGTTCACCCCATGGT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1588:3468:141624/2 +TTCTAGCAGTGTACTAATGCGAGTGAGCTAAAACGCAGCATCGTTGAGCGGGTCATTTATTACACTATATCCGTT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2921:15856:57939/2 
+ATTCTCACCGTGGCACGGATCCGAAGCTGCAAGCAGTTAACACATGATATCCGCACGTAGAAAACGCGTCCGATT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1417:15923:95590/2 +GAAGGGCCAGTCCAACGTCGCTAACGAGATGAACTTAATGCCGTGCGAGATTCTGCGGGGGCATCAGGGGTGCGA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2102:10514:9268/2 +TACTCAATCCAAATATTTCAATCCCCTCTGTTCAGACGAGATTAAGCCAGTACACATTACTCCAGACCCGGTCAA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:54:3501:193579/2 +TCAGGCTGTGCATTATTGCGTTGCGAAAAGACCATGATTGAAGAGCTCGTTTCAGCGTCCGGTGCTAACCGCATC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2888:15166:71020/2 +GAAAAAATATGTCTCAGTTTATCTTCCGTACTCAGTCCCTTGTAGATGATCGCATCTCGTCGCACCTGGGGTGGA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:782:4801:172498/2 +CATTATCCTGGGTGTAAGCGGCCCCGCGGCAAGTTGCTTGGAGTCAGTGGAGAGCATCCAGTAGTAGGTAACGAG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1840:12105:187126/2 +ACGCAAGCCTCACCTATTACCCTGCGGATCTCGCGATCACTATGCTGCGGCTACTCAAGTCTGGGCACAAGACTC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2153:1390:90531/2 +AAATTCAGCTTAGGACAACTTATGCGCCGGTTTCCTCACCGGGGTTTATCATGCGGCTGCGCGTACCGGCCTAAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2362:6530:45744/2 +CTGCACGTGTTGAAAGGTAGCGGTGGACCGCATGACCAATGGCTATGCTAACAATCGTTAAAAGTCGCAACACAT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:992:19260:109704/2 +GCGCATCAATAACCCTCTATAGAGTTTGAAGTCTGCGAGGGTGCCAGGGGCTTCAGCTCGACGAAAAGGGAAGAG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:70:16099:51055/2 +TGTGCAGTAAGCGACGTAAGCAGCTCGAGTCTCCCGGATTCAAGCGTACGGACTTGACTAGATGCGTACGCATGC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1603:19029:119019/2 +AGGTCGTAGAGTTTCCTTAGCTTGTAAGGTAGTACATAACCAGCTGGTGCGTGCAGTATGATGCTATACCCTTAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1872:3885:17311/2 +TGCGCATACCTGGCACAGCCGACTTAGTTACTATACGTCCGTAGTGCCCTATGTCATGCTCACGACAAAGTTGCG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1547:15205:108796/2 +AAGGTAAAGATTGTATTTACCACCGGCGCATGCAATTCTAGAAGCACTGACACGAACGCTCCTCAGCTGACGGAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1307:9506:116191/2 +CATTTTATTGATCTAAAGGCCATCTGCACCACACAAACGAGGTATTCCGCTCGAACTGGCCCCCTGAATAGCGAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1293:13399:104992/2 +CGTAGCCCAACCTTGGCTTGCTAAAAGCTACTTGGTGGACCAGTCTTGGACGTGTACACTCGTGATACCAATCCA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2245:4790:19502/2 +CTTTAGGAAACGTTAATAAGACGGCGCATCGAGATGTACACCACCCCTGCTGTTCGTACTCACTAGCGACAAGGC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1630:16736:9274/2 +CGCATCGACGAACCAACGTCCGCTGTTATGAATCATTATCTTCTAGATAACGAGCCTTAACTCAGAGGTAATAGA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1548:4346:62294/2 +GCATATATTTGCCTACCGAGTCTGTAAAAATTGTCCGTGATGTAACCGACACCTTCATCCCAGTTCCGGATAGCT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
+@HS3_9090:7:32:14349:156816/2 +AAGGGCCTAAGGAACTTCATCAGGTACGGTGCTACGGTCACAACGTTTTGTTATGTGACCCTTATTGGGATGGAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1322:16932:51209/2 +GAGCCTGGATTGACCAGCCACTATACAAATATAGGAAAGTTAGAGATGGTAATCGCCGAAACATGAACCGCCCAG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2657:15332:141945/2 +GCTAGCATGAACCACTGGTGGCTGTTAGAGCTTTCGCAGTTTGGGGAGCCCTGACTGGGTGGGTCTAGAGCTTAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:681:17025:20507/2 +AGCAATAGCAAGATAACTGATTACCCAGCGCGCCATTGCGGGCGAAGTGCTTAAGTTCGGCAAATACCGAATACG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1904:5984:48273/2 +CAAGTGTTTATGGGCATCACTTCGACGACGCTTTAAGGTAGAATGTATTTAGCACATAATACACCTGCTTTGGCG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2002:6401:148580/2 +ACACCGACGAAAGTCGGCAATGTCTGCATACTCCTGTTGTTCGGAGCTTGTAAGAGCGTTGCATTCTCCCTGCAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2051:6388:64396/2 +CCACTTAGCAAAAGCTGTGCAAATGCATCTACTCTTTAAGTACAATGAGCGTATGTTGGTAATAACCGCGGCCGT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:333:16095:70445/2 +ACTTTGACTATCACTAAAGTTGGCGCTATCTGATAGTCCATAATAACCGTGGTTCTGAAAGGGACAATATGGCAT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:235:4347:163096/2 +CGGGCCTAGGTCGGGAGCCACCTGCATGGGTCTATCCTAATCTCCAACTCAGCCGCGTGTTCACTCAGCGTATGA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2459:19347:74593/2 +TCCTCGTCCGGCTGGCGGCTTCGTTCCTCTACCGCTGATGCAACCAGTGTGGGCCTCGTGACGGACTAAGTAATA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:810:16180:96914/2 +TAGGGACCCCACGAATGCTAGAAGGTCGAGGTAGTTAGTGATCTTTACTCGATATCTCTACCGGTCACCGTAAAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1634:5631:85585/2 +CCGCCTATTCGTTGCCCGGATCGTTAAAAAGACCGCTGAGCTGGCACGAATATTGGCAATTGACGTACCCGCGTT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2962:11381:21217/2 +AACTAAACTCCCCTCGCGCGACAGCTAGACTTGAAGGCCTATGCTCATCTTTTAAACGAGAGCAGCCGGGTAACA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2633:13732:192447/2 +AACTGAGGGTCTTATCCAATAAGCTATTAAGGCTACACACCTGTTTTCTCTTTAGCAATTGGACTACGCTGAGCA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2087:17402:125788/2 +GAAGTCCATCGAAGATACCACAGCATCCCCGCAGTACAACCTTTCTCGCGGACAGGGCCTTACAATACCGTTCCT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:781:9732:169661/2 +TAAAAGTGTAAGCGCACCTTTCGCGAGCGGGACCTTAGATTTAAGTTCGGTAATTGAGTCGACGCCACGAGGGAG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1662:16108:137898/2 +GAGAAAGGGAAATTTGTGACTATTTATGATGATCCCCTGTCACAAATTCTAAGATGATGTGCTAGCCCCCTCTGA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:919:19941:114306/2 +CGACTTGCTAGGGTCCATGGGTGTTCGTTATTGGTATTCTGAATTAGTCCCCTCTTTTTTAACATCCGAGTCCTG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2754:8750:58584/2 +TCCTGGTAGCGACGGTTACACACGCAAAGCCTCACGGGTACTGTAATATTCTATTTGCTCCTTCTTACTGGAGAT ++ 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:336:1521:165232/2 +GGGAGTATACGTAGTGCCGACAAGTGATCCGGATGGCTAAGATCCCCGACCCGTTTTGGTAGGTTCCGGGAAGCC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1355:15756:164948/2 +CGTGCGGCCTTATTGGGACTCGACGCCGTCACAGGATATATCGCGCTGAGCGTTTCATATATTCATCGGCTGTTT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2231:4273:60186/2 +AGGGTCATGACTACCCGCACTGATATCAGTATGTATGGTATGCTCGCGGGCATCGGGGAGCCAGTTAAACACTGC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2316:6407:47755/2 +CGTGTAGGCCCGACTCCATGTTTTGCATTGCTATTTCCCACGCCGTCCCACGTCTCTGCATAACTGGGAGGTATA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1180:9151:105392/2 +AAGTTGGAATGAACCTCGAGTTCAAGGAGTATCCCGCAGACTTTAGCCGTAAGGCAGACAGCGCAAACTAATCAG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:316:19348:39980/2 +TTGTGCAGTAGCGGATGTATTGTAAGCTGGATGCGGCCGACGTCGTGACCCTCTTATTGAGCAGCTCCCACACGC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2293:5852:41801/2 +CATGTTTAGCAAGACTTTCTCTCAGGGTGGAAGAACGGCGCCGATATATCAAACACAAATGCAAAGTGAAATAAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:194:2629:116686/2 +AGGGCGTACAACAGATGGTAACCACGGGGTTGACATAGACCCTGCACCTATGGATTATTCAGGGGATACGCTCAG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:504:19694:38065/2 +GATGGGTTGTGCGGCATGGTATCGTGGACTAGTTGGACCAGATGTAAGCGTGTATCGCGACTGTAACCACTGTTT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:40:16690:103457/2 +TCGATCACAGTGCATGCGTTCTATTCCTATGAACGAAAGCTGGACAAGAAACGCTATTCTTACAATTAATATCGA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2665:14543:75970/2 +CAGACTGAAAGACACCCCGCTAACCCCCTGTGACGATTACGCTGCAGGTGTTGGATGGCTGTCACTCGCACCAGT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:238:8472:192625/2 +TCTTTAGAGATCCCTGATTTTAAACGATACCCTGAGTACCGTAGTGAGCAGAGTATGTCAAGTCCGAGCCTCGTA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:457:4583:118080/2 +GGGATCGCTCGCGCCGTTAAGGGGTATATAGGCCTCCGTGACTTGTACCGGGCTCGACTTGGCGCCTAAGTAGTA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2124:7309:107717/2 +GACAGGGGTGACCTATTATCGCACGCAACTCTGGCAAGATAGCTATGTAACCCAATTCAGCGGCGATAGATCGGA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2793:15293:1228/2 +CGTTTCCCATGCTCCCTTAAACGCCTCATCATGCTCGGCCATTTTTCCCGGGAGCTTCTTGGGTAGGTTCGATTT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1021:19152:59391/2 +CCTGGGCCTAATCTATCGTAAGACCCGCGGTTCGCTCCACCGTATAACAGTCAGCAGGTCTAGTAGTAGCGATTC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1834:14761:60846/2 +CATCAGCGGGATGTCTAACAAGCATCATATTCGTCATCTAGACTTATCATCGTACCGGCGATTGTATAGACCGTG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1085:18337:179806/2 +ATCTAACTCTGATTATCACATATATTGGTGGCACGCAGAATTCGATGTCAGCGAGGCTAACACTCGGCAGCCAGA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:224:19111:133561/2 
+AAGGCCATGTATCCAACTCAATCGGCCTGCCATAGCAACTCTGTGTACAATGAAGAACCACCTCCATCGTGCCTC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2983:826:9193/2 +CATACTGCCATGGACCCTAGAAGAACCAGGAGTGGGAGAAAGAGGTATCGGATTCCTGGGGGTAGTATCTATCAT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:338:13920:17267/2 +TCGATGGGTGTTTTGGCTTCATAGATTATATTGGCGCCCTCAGAAATTATTGCAACGTCCGCGCCTGACGAGCTC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1364:13563:167748/2 +TGGTGCTATGACGCGTTCACAAATGAAGCTCTAAGAGAACAGCAACACCACCCTGAATACGTGTACTCTGCCATA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:607:13103:139333/2 +AGCTTCGAACACGTACACGGTAAGAGTTGGGGTAGCGCGTCCACTGCGAACTGCCGGGTTAATCAGAGTGTGCGC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1747:1408:125904/2 +AGAATAGCGCAGAGGGAGCGATCGTCGCGTGACGGGCTTAAACTTTAGGTTTGACTTTTTGTTCTCAGTCCAGCT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1148:16465:13804/2 +GGATATACCCTGGTCGCTACGGTTGGTCCTGTACTACGACCACGACGGAGAGGTCGGTTGGCTTTCCCGACTGGG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1176:3250:145877/2 +CTTTGTCTGTAATAGTACCAAGTTGAAAGCTGGGTACGTGGGCTGGGGCGCCGACACTCCCCTGACCATATTGGA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2792:6902:44593/2 +ACTTGGTAATGCAACACTTGGGTGACCAACCCATCACCTCGATCAACTGCATGGGCTTGCAGACATGAGTCTGGT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2857:19869:180991/2 +TCTGCTATCTTCCCCAGTAACACATCTTGAAGTATTCTGACGCGCGGAATGTGGCTGAAGGTTCCACCAAACTGT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:755:19931:178280/2 +AAACTATAAATATCAAGCACCCTGGATATCAACGTATTCCGAGTGGCCCGAGCGCATTCCGTGCCTACCCGGGCC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2480:155:63389/2 +ATGGATGTCTACGAAAGAGCCTAGTAAAGTACTCGCCCGACAGGAAGCTTCACTTTTGTTAGTGGCATAGTGTCG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2843:10575:3131/2 +TGACTAGCTTAGAGAAACCAGTGTAGCGGTACACTTCTGTCAGGGAGTTCAAGCCGGAATTATATTAAAAAGGTA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1770:4081:197491/2 +CTCTTCGGAAAATTATAAATTTGCGAGCATATGTTTTGGCGCGTGTCTCCCATTACCATCGATAGGGAGTATGCC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2062:3642:13147/2 +TAAGCCGTGATGTCTCACTTAATTGGCTCAGCTGGCCCCACAAGTAAAGGCCTGGAAGTGTCATACACGAAACTC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:193:6561:99678/2 +GTTAAGGAACTTAGGTGAGTATCATTCTTCCACTAGGGCAACAATTTACCATCCGCCCAAAACTCTAGGGTCGTC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2562:3408:19709/2 +AAAGACGTCAGACTCTTCTCCCATTGCCGGACTCTCAATCCTCGACAATAACATACGAATCCCACGTATCACCAT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:97:6403:33976/2 +TCAACCTAGAGCCTCGCACATTTTGTAGAATACTGAAGGGTTCGATCCGATGGCTGTCCTGATGAACGCTTATCG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1693:9857:189526/2 +CACTCAGAACGTCTTTCCAGATAGTACAATGCGAAGGCCATTACCGTGGGGATTCGCAGGAGTTGAGAAAACCTC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
+@HS3_9090:7:723:3632:121816/2 +CCAGCATGCTCCTACCTCCAAAAGCCCTTTGTCTAGATCTACGAGTAGCGCGTTGAAGACGTTAATGCCCACAGT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:590:19844:171664/2 +AGTTGGTACTATTGCGGATGAGGCCAGTCAATGGACATGTGTATGATCACACCACCGGATCAACCCGTACTTTCC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1274:4201:171834/2 +CAAGCGCAACTCAATACTTTGCTAGGACTCTCTTGAGCTGAATGCGGGCTGTAAGTTGGTGAATAAGGCGCCGGC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2299:4436:81404/2 +AAGAGGGATGTGAAACTTAGGACATGGGCATAGGATGCCAACTGTGGATGGGTTGTCATATGCGTAAAACCAACG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2962:13933:163719/2 +CGATCAACAAGTACAGGATAACTCGCTGATATCTTTAGCTCCGAAGCTAAGCAATGAAGTACTCACATTACTCTC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:79:10717:191559/2 +CGGATCTCGTACATGTGTCGACCCATGAACTATCTGCCTGTAGTCCATTGCACTGAATATGCTATCTGGTAAGTA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:728:6848:44913/2 +TCTGCTTGATTTCGCGTATGTTTCACGTAATCAGCTGAAAAGTATACGGGGCGAACTATTAGCTCCCCATCCGGA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1805:19028:29375/2 +CGGCCGGCATCCGGAATTTGCTCGTAAATTTAAAAAAGATCGTTTTGTCGTATCATCATTCCTTAGGTGCCGTCT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:2525:11691:147816/2 +AAAAATGTACACCCATTGACTGGTAGTGACGAGGTCAACGCACCATTATGCTATCTGCCCAGAGTCTTCTAGTTT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1456:3792:55151/2 +TTCACCGGTACTGTTGTGCAACCGAATGGAGGACGGTCTTTTGGTCCGAAAGAAGAATATGTATACAAACACCGC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1759:15822:177463/2 +TCAGCTGAAAAGAAACTAGTTAGGGGAGAGCGGCGTCGGCTCGGTGAGCACCCACAGCCGTAGTTTTACCAGTTC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:1434:12716:156687/2 +GCCCCCTACCAGGTGGAGCAAAGCGCAGTTCCGGTTGTGATATACAGCGCCTTGGCTTAGCATTGACCGCAATTG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:79:14794:163993/2 +TAGTACGATCCCGTACAATAGTTGGTGATGACTTACGATCATTAATAGTCCCGCGTGGGGGGTGATTGCTGAAAT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:313:3610:41049/2 +CGGCAAGAATCAGTTTTTTTGTTTCTTGCACAAACTGCTCTGTTGGATCCTGTGGCCGGACGAACTTGTGTTTAA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:766:16681:39314/2 +ATTACACCGTGTAAGTAGAGACTGGAGTCAAAGGTTGCGCGCGAGTCCACAGAACTTACTAAACCCTCACGTTAG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:494:9805:67437/2 +CGGGTCGCCAAACAAGCGGGTATTCCGTCGCACTCGCCGTGGACTGCGCAATTTGGAAAATGGGGACGGCGACAG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:638:6498:188736/2 +GAATTACCGTAAGTCGAGTACTGAGGAAGACAGCCGAGTGTCAGCCAGTGAACGCCGACTTGACTGTACAGCATA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS3_9090:7:350:3180:176176/2 +CACAGAAGCGTTTACACTTACTAGATTCACTACCGCGGAATCGTCCAATCTACCAGGAATCTATTTCACCGCAGT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/t/data/test2.bam b/t/data/test2.bam new file mode 100644 index 0000000..18aaa1b Binary files /dev/null and b/t/data/test2.bam differ diff --git 
a/t/data/test2.bam.bai b/t/data/test2.bam.bai new file mode 100644 index 0000000..e357bb6 Binary files /dev/null and b/t/data/test2.bam.bai differ diff --git a/t/data/test2_1.fastq b/t/data/test2_1.fastq new file mode 100644 index 0000000..7c9e9d3 --- /dev/null +++ b/t/data/test2_1.fastq @@ -0,0 +1,400 @@ +@HS46_3182:5:2405:11074:43547/1 +CAGTCTCCNNTTTTTTTTTTTTTTTGGTGCTACCGAATATAGAAGAACACGTTC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1341:11922:194601/1 +GGCATTATGCAAAAAAAAAAAAAACCTTCTGTATCGGATCATCCCAACGGAACT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:272:17688:181326/1 +GTCAGGCGNNTTTTTTTTTTTTTTGTACCTCCCTGGGGCCCATGTTCATCCGAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1564:15853:100266/1 +TCCTCGAANNTTTTTTTTTTTTTTCCATAGCCTCGTATTACTACGTTCGGTCTG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1752:18919:39066/1 +GTATTGTCAAAAAAAAAAAAAAAACTTCAGGATAGCAACCAGGCATGGACGCGA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1408:2410:48242/1 +GGTACGAGGCTTTTTTTTTTTTTTACTAGCTTTACAGCTGAAGGGCATCGGCCT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:381:4299:36056/1 +CGTAAGAGGCTTTTTTTTTTTTTTTTTCTCGAATATACTGTGCCCGCGTTTCCA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1073:8428:128989/1 +CATTCGAGGCTTTTTTTTTTTTTTAGGTGGGGCTACACCTCAAAGTATAACGCG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:771:4279:15727/1 +GGCTTAGATGTTTTTTTTTTTTTTAATACGTTGATCTTTCGCTACTAAGTTGCC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:584:1207:121853/1 +GTTCCGAGGCTTTTTTTTTTTTTTATACAAGACATAGGGTCATATTGTTGTCAG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2354:16276:7208/1 +TGGCGAGAAGAAAAAAAAAAAAAACATACTAACCGGATGATGGATTCTGTAGTA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:80:5305:54651/1 +GGGCCAGAAGTTTTTTTTTTTTTTTAAAGACCCGAGAGGGCCCTAGGAGATTCT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:106:2551:71099/1 +TGGTGCGGNNTTTTTTTTTTTTTTGTGTTCCAGTACCGCTAGTTGTAGAGTGGG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1817:7790:154523/1 +CAATGAGAAGTTTTTTTTTTTTTTTTGATTGACAATCTCAGAGACCCAAGCTCC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:178:16976:72000/1 +AGGAGAGTAGTTTTTTTTTTTTTTGCTGACCATCAATGGGTACGACTGCCAGCC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2547:4151:12793/1 +AAACTAGAAGTTTTTTTTTTTTTTTTTCCCCGAAGACTCACGAAAATATCCCAA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2291:14901:34113/1 +GCTAGAGANNTTTTTTTTTTTTTTTAGGAAGCCTGAACAAAGCTACGTTTGGTC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2934:2829:31483/1 +AGTCTTGAAGTTTTTTTTTTTTTTTTGGTGTGCTTAGCGGCCAAGACTCGGAAT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1249:16218:147727/1 +CTTGTAGAAGTTTTTTTTTTTTTTTGCATGTAGCTTACGTTGCTCCGCAACGAG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:553:771:39903/1 +TCTTCTGAAGAAAAAAAAAAAAAAAACGGGGTATCAGTCTGAAGCACTTCAGCA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1223:9525:65794/1 +GGCCTAGAAGTTTTTTTTTTTTTTTCTTCATTATGATGCTTAGTGAAAATTAGC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1191:17978:187481/1 +TCTAGGAGCCTTTTTTTTTTTTTTAGGAAGTAGTAGATATTCCCTTTCAAGGCA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
+@HS46_3182:5:3:10917:33452/1 +TTTCTAGAAGAAAAAAAAAAAAAATCGATAACGCTGTCAACCGCGACCCCGCAA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:177:10045:179524/1 +TTTTCACAAGTTTTTTTTTTTTTTGGTCGTTTGGATGCATTTACCGTGGCCACA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:946:13566:137436/1 +GCTCGGAGGCAAAAAAAAAAAAAAAAGACTAGTGGTTTACCCGGAGGTCCCCGG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1148:656:9363/1 +CAGGAGAGGCTTTTTTTTTTTTTTCATCATAACCTCTCTTATAGGTGGTGTCTT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1498:3307:70020/1 +CTGATGAGNNTTTTTTTTTTTTTTATCCGTCCGTAGGAAGTCGTAAATGTCGGG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2248:4900:51665/1 +CAATGGGCTTAAAAAAAAAAAAAAACGAGGAGTGTGGGTCATGCGGTGCCGTAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:807:7391:123290/1 +GGTCGAGTNNTTTTTTTTTTTTTTAGCTAATTGAGCAGAGTTGGCAAACTTCAG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2455:17864:5895/1 +TGCTGAGAACTTTTTTTTTTTTTTCACCGCGTGGTCTGTCCGTGGGGTGCCCGT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1257:14923:50658/1 +GGCCGAGTNNTTTTTTTTTTTTTTCGGGTCTCAACCTGGAAATTTGAGCGGAGT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:492:12915:12514/1 +GACATCAGGCTTTTTTTTTTTTTTAGTGCTCCTTGTGTGCGCGCTTCTCCATGC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:235:7270:100350/1 +TCACCAGTAGTTTTTTTTTTTTTTATGGTACAGCAATAGGTGAGACCAGCGTCT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2260:14085:3683/1 +ATAGTAAANNTTTTTTTTTTTTTTCACTACATCGACCTGTCTACTTTTGCACCC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2169:9651:142664/1 +ATGGCAGTAGTTTTTTTTTTTTTTTTACATCGCGTGTAGATATATTTGTCGTTT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:540:14663:48756/1 +GCTGCGAGGCTTTTTTTTTTTTTTTTTACTTATCAGTCTAAAAAGCGAACAGGA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1091:11404:166341/1 +AAAGGCCTNNTTTTTTTTTTTTTTAAACCTAGGGAGCGCCGCTAGGTGCAGCTA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:608:7328:131607/1 +GAGTGAGAAGAAAAAAAAAAAAAAAGTTGAATGCATTCATTCTTTTCTAGTCGG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:522:3049:47753/1 +CATAGAGAAGAAAAAAAAAAAAAACCTGTTGGTCCCCGAATAGTTTGCGGGGAG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1283:16887:169154/1 +GTCATGAGCCTTTTTTTTTTTTTTAGTTTTTTTGACGTACGGCGTGTGGCCCTA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1095:4319:4606/1 +ACTTATGCNNTTTTTTTTTTTTTTTAAAAGGAATGCACGTATCTAATCGCGTCA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:752:19585:108953/1 +TCCGGCAANNTTTTTTTTTTTTTTTATTTGGTGAAGATCTTTTCCAATCCGTGC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:752:3015:99805/1 +CACTTGAGGCTTTTTTTTTTTTTTCGGTGGTACGTAGTTCAGGCTAATAATAGG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:332:4846:77772/1 +TACCTCAGGCTTTTTTTTTTTTTTGCTTTTAAGTGCCACGGGGGAGGACTGGAA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1140:784:66805/1 +AACGTCGGNNTTTTTTTTTTTTTTCCTACTCTTACGCCACTGTCGGGAGCCGAT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:473:18338:33594/1 +TTAATAGAAGTTTTTTTTTTTTTTCTAACTCGTTTCGCCCCGGGGATCGAACGT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
+@HS46_3182:5:2702:3903:96584/1 +CGAGTGAGGCTTTTTTTTTTTTTTGAAATTGCTCACTTACGGACCTTCACCACC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1797:11052:97278/1 +TCGTTGAGTAAAAAAAAAAAAAAATACGCTCATGGTCATTACATGCGCTTTAAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:820:13922:80241/1 +AAAGCGAGGGAAAAAAAAAAAAAACGCCCCAGTCAATATTTTTACTAATTGGTG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:555:4732:87380/1 +CAGGTAGAAGTTTTTTTTTTTTTTCCTGACGTACTACTTGAGTCGGCGGCAGTA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:659:12505:103681/1 +GCGTTGTGGCTTTTTTTTTTTTTTGCGCCTGTAGTTCAGCTCTCTAGCTGACAT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:59:2864:12297/1 +TCGCTCGANNTTTTTTTTTTTTTTGTTACGAATTCAGAGGTACGCACCAGCCGG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1383:4754:130496/1 +ATTCTGACGCTTTTTTTTTTTTTTCCTCTGTCCAATTGAACAGTCTGGTCCGGC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2240:16766:66728/1 +CATGGAGAAGAAAAAAAAAAAAAATCGTTCTTGATGACAGTAGTAATATATTAA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2100:7817:14667/1 +ACAGTAGAACTTTTTTTTTTTTTTGAATCCAAACTCTGCGCCATAAAATCATAG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:895:13916:72786/1 +AGCGCAGATGTTTTTTTTTTTTTTGGAAATGGATGTAGGATTTGCCAAGAGTGC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:418:13660:161873/1 +AAGAGAGAAGTTTTTTTTTTTTTTAATGATGGTCAGTCAGCAATCACCACCCAG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1283:12361:37079/1 +ATTGGGAGCCTTTTTTTTTTTTTTGCCGCAGCATAGGGTTCTCAATTGCTGAAG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:347:5935:115159/1 +TTAACCGCNNTTTTTTTTTTTTTTAGGAGACACGAGGATAATCGTGAGTACGAA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1983:1677:46718/1 +TTGGTGAGGCTTTTTTTTTTTTTTAAGATGCCTTGCGTGGATGCGAGCCCTGTA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1380:1444:154221/1 +GAAACAGCNNTTTTTTTTTTTTTTCATCAGTATCCCGCGGTCGAGAGTCAAAGT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2040:5556:131112/1 +CACTATCGNNTTTTTTTTTTTTTTGATGCCATTCATCGGCATGGGTGCGTTGTG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2789:7394:44852/1 +GACACAGAAGTTTTTTTTTTTTTTGATCTGCAGCTTTATGCCGTAATCGAGACT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1215:14424:11602/1 +GTTGTGAGGCTTTTTTTTTTTTTTTAAAGTGTTCCCACCCACCAGATAAGTGCG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2475:14123:112282/1 +TCCATAGAAGTTTTTTTTTTTTTTTGGAGGCGATCCTATGACGAAGAGATAATC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2546:19759:182333/1 +TTTAGAGAAGTTTTTTTTTTTTTTCTGCGGCGATCGCCGGAAGGCGTTGCATAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1849:14231:162499/1 +TTAAGACAAGTTTTTTTTTTTTTTTTGTCCAAAGATAATTCTCTGAGAACAGCA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:662:17029:25695/1 +TCCCTGACGCTTTTTTTTTTTTTTTTGCGAGATTGGACGCGCCGTAGAAAGTGA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:566:4782:154015/1 +CGACCAGAAGTTTTTTTTTTTTTTTTCCCGCCCTCGGCGAGATGAAGGCTCCAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2085:5947:93787/1 +CACCGTAGNNTTTTTTTTTTTTTTCTACCTCTTGGACCTAAACCAGTACCGATC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
+@HS46_3182:5:1281:315:57513/1 +CCAACAGAAGTTTTTTTTTTTTTTGAAACGTATTACTTCTAACGTTTGGAGGGT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2225:9077:172869/1 +GCCACGAGGCTTTTTTTTTTTTTTCGTTCTAAAGTTCTCTCCCGTACTTGAGAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:373:3420:105949/1 +TTAATAGAAGTTTTTTTTTTTTTTCATTTATGATCAGGCGGGATGTAACATATG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1805:9722:68857/1 +CACTGTAGNNTTTTTTTTTTTTTTTGCATCGATCACTATTCCTTGACAATATTT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1350:15373:181220/1 +GTTTAGAANNTTTTTTTTTTTTTTCCTCTATTCATTGGCGTGCGGTGCAGTGCT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:44:17607:19283/1 +TCCTCGAGGCTTTTTTTTTTTTTTACTCCGTCGTAACGATTCGCCGCCCGCCGA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1732:2108:100386/1 +GTTCAAGTNNTTTTTTTTTTTTTTGGTATGGGCCCTCAACTTATTGCTCATTGG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:250:15883:128840/1 +ACCTCGAGGCTTTTTTTTTTTTTTTCCTGAAAGTCGACGACTCTCACGAAATGG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2113:7932:11007/1 +GGCAGAGAAGTTTTTTTTTTTTTTAATCAGCTGTAGTACTGACCGACCGAAAGT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1891:12437:47748/1 +TTCCGAGAAGTTTTTTTTTTTTTTTACTTTCGCTGGAAACGGCCCTCGTCCATT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2369:5530:14886/1 +TGAATAGAAGTTTTTTTTTTTTTTATGCCATGTAGGACCGCCCCCAGGTGTCAA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1742:17056:119120/1 +CTCGGCAGGCAAAAAAAAAAAAAATTTCCATTCCAAAGGACTCTTCCCGAACAT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2687:831:113506/1 +AACCTTCCNNTTTTTTTTTTTTTTTCATTTATTTGTAGGCAACAGAATGCGCGA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:451:18172:127749/1 +AAAGTGAGGCTTTTTTTTTTTTTTGTTTTATCCAGAGCTGTTAGGGTTCTTTCC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1906:6140:154562/1 +ACCCGAGAAGTTTTTTTTTTTTTTTAGAATCACGATGTCAGGACGCAATCCGTC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:745:10175:155937/1 +TGGCTTGAAGTTTTTTTTTTTTTTCCAACAGCCAGCCCGGCACACGAGGTACAG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2679:13605:42940/1 +CACACAGAAGTTTTTTTTTTTTTTGAGTATGTTTACGATACGTGTGCTAGAGTC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:525:19584:47076/1 +CGGACAGATGTTTTTTTTTTTTTTATCGATTGACTCCTGATGGGTGATTCCGTG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:541:14390:18530/1 +GCAGGCGANNTTTTTTTTTTTTTTTTTCTCCCCGCGGCACAGTTTGATTCGTCC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2371:19010:102788/1 +AAGACTATNNTTTTTTTTTTTTTTAAAGGATATAGAAATTATTCTGGGGCCGAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1477:14980:45119/1 +CTCGGAGAAGTTTTTTTTTTTTTTGGTCGATCGCCTTCGCTCCTGGTGTACCGC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2719:11588:38543/1 +AGTTGAGGNNTTTTTTTTTTTTTTGTCGTTAGTGAGGTGGCCAAGTTTAGCCGG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1734:8972:136811/1 +GTACTACAAGAAAAAAAAAAAAAAGGAATGACTCCGTGAACAACGCTTCACGAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2208:6046:184805/1 +GTACCGTGGCAAAAAAAAAAAAAATACCGATCGCATCGGCCGCCTCAACGATCC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
+@HS46_3182:5:173:5726:155286/1 +CGGGCAGAAGTTTTTTTTTTTTTTCGCTTCGTGGGAGGGTCCGGCTGACTGATG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1308:11903:50795/1 +AGCCGGAGGCTTTTTTTTTTTTTTGGAGTCTCATACGAGCACCACCCTTTTGGT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1898:10887:153698/1 +TCATCCTTNNTTTTTTTTTTTTTTACGGGTCCTGTCGCTGCCGTAGACATGGAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:772:13347:20065/1 +AATTTAGAAGTTTTTTTTTTTTTTCCATATATATACCCACGCATTGGGAGGGCG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2480:4129:148337/1 +CATATGTGGCTTTTTTTTTTTTTTTCAAACAGACTATGTCATGAAAGCACCCAA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:256:18917:52058/1 +GTGCGAGAAGTTTTTTTTTTTTTTGTGGCTTCTTATTTTGTGACTTTCCAGATG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/t/data/test2_2.fastq b/t/data/test2_2.fastq new file mode 100644 index 0000000..77f752e --- /dev/null +++ b/t/data/test2_2.fastq @@ -0,0 +1,400 @@ +@HS46_3182:5:2405:11074:43547/2 +CTAGGGCTGTCCAAGCAGATATACGCGCAACACAGAGTGTGAAAATAATACTCA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1341:11922:194601/2 +TCCCTCCAGAGGTTAGTCATAGTCAGTCCCCGCACGACGTGGGTTAGGGGTTGT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:272:17688:181326/2 +GTGGGGATGTATTGACTTCGCGGTTCTGATCCCACTGGGTAACAACCGATGGAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1564:15853:100266/2 +GTGTATCCAACGATCGCTTGATTGGTGCTGTTGCGGGGGAGTAAACATGGGCAA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1752:18919:39066/2 +GCGGACCATTCAGGCAGATTGACCTTGCCTGCTTCTACTGCATGCACATATTTA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1408:2410:48242/2 +ATTATATGGATGCGTCATTTGTTCTACGCACCGCCTGTCCGACCAGTAACCACT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:381:4299:36056/2 +GGGGGTAGCAGCTATATATCCCCACGGGCTAAGAGCTTAGACCCAAGAGGCGGT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1073:8428:128989/2 +TTATATTCTGGAAATTGTTAAGCGTCATAGGCCGCAAAACCCGATTATCCATGA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:771:4279:15727/2 +GCTTTCGATTTACAGCGTGTGATGAGTACACTCTCGCAACCAAGGTCCAGTGTC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:584:1207:121853/2 +GTAAGAAGTTCTACTAAGGCAACCGGCTTAGGCTGATCAAGAATGTCGGGCGCC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2354:16276:7208/2 +TAGCAGGCTGTAACCCGTGAACTAACTCGGAGTTGTTTGATGGCCGACGACACT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:80:5305:54651/2 +CTATCCACTCGTAGTCTAATAACGCGAGTAGCTCAAGGCAAAGTGTTTGCTCAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:106:2551:71099/2 +TAGTGACGCTTTGAAAGGTCTTGACTTTAGTAAGCCATTTTTTGAAGCCGGCGC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1817:7790:154523/2 +GGATTGTCTTTTTCAGGCGTTTGTAACCTGTGCTCTAGACTTTATTGATCGTTG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:178:16976:72000/2 +GTTGGACGTGTTTTTGGGTTCGGGAGGATTCCCGAGCCACTCACAGGATTTATG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2547:4151:12793/2 +CAGGCCTCTCTACTCGCGATATTGTTAACTTGTGGGCTGTTCCATGCGGCAACC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2291:14901:34113/2 +ACGGCGGAATACGCTCAGACCGAACGTCTTAGACCTTACAAGTGCTCAGTATAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
+@HS46_3182:5:2934:2829:31483/2 +CTCTGTGACCTCGACTATTTATACATCCGCGCTGGGCTTTCTGCGCGGTACATC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1249:16218:147727/2 +AGGCGTGGGAATTTCCATGTCCTCGCCTTCGAAGGGCTAATGAGAGAGTTATAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:553:771:39903/2 +AGCAAAATCCGTCGAGGCACCAAAGTTGGTCTTTGCCCTCGCGACAAGTGGCGT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1223:9525:65794/2 +CTAGAGATACACCATTGTTACCCGCGCGATAAATACGCATGGGGTGCCGCACTG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1191:17978:187481/2 +ACATCGGGTTACGATCCAAAAGTTCGGAGTAAATAGACAGATACTCAACTGTCG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:3:10917:33452/2 +CAATGTGAGGTACTGATCTTCTCGGATGTGGGTTGCCCGATCAGAATTCATTGA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:177:10045:179524/2 +CTCCATCAAGCGATCTGGATCAGCATCACTAGCTCGCGTCGCTCCGCTCGCTAA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:946:13566:137436/2 +CTTCCGTATAATCGCTTCAGCTTTGCTACAATTTCCCTCGGTTTGAACAGTCAA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1148:656:9363/2 +GCTCTGCAGTACGAACAACTCCTAAATTGGCAACGCAGTGGGACCAGAGACAAA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1498:3307:70020/2 +GGACATGCGAGAAATAATTAAATAGGGGAAGACGTAGGTTCCACTGCTCACAGT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2248:4900:51665/2 +CGACCGTACGCGCGATTAGTGACGGACGGTTTACAGAAATAACGTAACGGAGGA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:807:7391:123290/2 +GACGAGAGTGGCTAATACAAAGTCTCTCGGGTCGGCGTATGCCTAAACTTTATC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2455:17864:5895/2 +ACGACGTGTTCTCGTTACAAACGCACCTTGCCCTAGCTACGGCAGCTTGTGAGC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1257:14923:50658/2 +ATGGTCCGGTTTAGGTGAGCACATCGTCACCCTCCTAAAGCGTCATAATGAACG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:492:12915:12514/2 +AACTAGTGCTTCGCGATCCCGAACTTTTCTATGATATTAGGTTTACTCTCAGCC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:235:7270:100350/2 +AGGTGTAATGTCACAAAGTCATGCTTCTCCACTATCGCCTTGAACTAATCGCTT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2260:14085:3683/2 +AGCATTACATATGGTCACTCATGCAGGCCCGACCCAACGTTGCTCATCTGAGCC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2169:9651:142664/2 +GAGGTGTTCAAAGACTGGTTAAGGTCACTGCTGGGAGGACATCGCATCTCTATC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:540:14663:48756/2 +GAATTGTACGGGATTTACCCGAGGCATCAGCATTACATCTCATAAGCCACGGTC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1091:11404:166341/2 +CCCCGGTGATTTCGTGATGATGCCTTTATAGTGCTTTCTAGCCCACGCATACCT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:608:7328:131607/2 +AGTCTGATTTGCTCGCTTTGTCACGAGAGGGGTATCACTAGATACATGGGCATG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:522:3049:47753/2 +CGCAAATCTGGCGAGTTCGCATCGGTTATCATTACCCCGCTCAGGCCACTTCTT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1283:16887:169154/2 +TGCTAGCGTTTGGTCTCGAGTGGGCGCAAGCGCACTTGCAAAGATACCCAGCGA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1095:4319:4606/2 +TGTTCTGGTCCGATCTCCGTCTATCTTAATGATACGAGACGATTCATCCAAGTT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
+@HS46_3182:5:752:19585:108953/2 +AGTTCGCTGGTTGACCTAATTGTAAGCCAGTTGCGGGCCGTAAATGCGAGGATG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:752:3015:99805/2 +GTCAAACGGGGTTGGCAACCTCGCTCTAGCGTAATGTCTCCACCTCGGAACCTC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:332:4846:77772/2 +GACAATAACACAAGCCCCGGGAAAATTTATCGTCTAGTAGATACCTTGCCATCT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1140:784:66805/2 +TGGAGAGTAGTCATCGCACTATACTCACACCTGGAGAGTTTGCGGTTCCCCTAA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:473:18338:33594/2 +AGACTCCGCGGACCGACCCATCGTTCTCGCACTTATGCGGATAGGTACCGATTG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2702:3903:96584/2 +TGATTGGGGATTTACGGCACGATCCACAATCGGCTTTATTGCGAGTTCAATGTG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1797:11052:97278/2 +TCACTCAGCAGTGCGTTCCAACTTCTCCTCGTCAGCGAACACAGCATACTGTGC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:820:13922:80241/2 +TTTATAGGATGCGAGAGCGTGAAGCACGAAAATCTCGTTTTACTCCCTAAAAAG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:555:4732:87380/2 +GTCCCTGGAACAGTAGCGTGAGCAGATCCAAGGCATCTGGGATGTTAGGTTTGG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:659:12505:103681/2 +GTGAGGCCGTGCAATCGCTGCACGCGAGCAACATAGAAAAGGCGATTCTGCCCT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:59:2864:12297/2 +CAATAATTACGCCCTTCTCGAATAGTCCCCGGTGTCTTCATTACTTTCCAAATT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1383:4754:130496/2 +GGATTCATCATGCTAGTACAGCGAGTCTCTACGGTAGGAAGTTAGATTAGCGCT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2240:16766:66728/2 +CGTGAAATACCGTGCATGGAACTAAGTGCCGCGGTTGCCTCTACATATCTCATT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2100:7817:14667/2 +GTGAAATCGAGCAGTGAGTTTGCCAGCATACGAGATCACCGACTTCATGCCCAT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:895:13916:72786/2 +GGCCAGACAATGGCAATTGCACGCCCCGGTTGCCATTACGTGGTTTCAATATCG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:418:13660:161873/2 +TAACTCCCTTGAACGCGTGCCACGGGCGATCGAGATGCGAAAAACACGCGCAGA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1283:12361:37079/2 +TAGCTCCGGGTGCGTAAAGCCATAGACTCACGACATATATGGTTTGTCTTGTTT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:347:5935:115159/2 +AGCAGACTTGCTAAACCTGCTATAGCGTGGGCACCTGGATCACAGAAGAGTTTA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1983:1677:46718/2 +AGCTCTTTCAAGTACTTATTGAGTCGACATAGAGATCCGCAGACTCATCCTACA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1380:1444:154221/2 +TTATAACCTGCTATCGTGATGGAGTTAACCCTTTGGTGCTAATGCATGCTTAAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2040:5556:131112/2 +GATAATACCGACGGTCAACAGCGAAGTCGTTGGCTATGGACTCGTCTGTAACTT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2789:7394:44852/2 +TGTATAATCCTATGTCCTACCACCCTGGCGGAATAGATTTGCAATTACAGACAG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1215:14424:11602/2 +ATAAAAGGGAAGCCATACGTGCGAGAGCACACACTAGGTGACTAGTACTTCAAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2475:14123:112282/2 +CTCTGTCAATGCCAAATAGACGCGCATTTATAAAATAATAAAGTGCACCATGCA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
+@HS46_3182:5:2546:19759:182333/2 +GATATCTCGAATAACGGGCGTGTTTTCCTCTCATTGAATTAACGGCGAAGCTTC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1849:14231:162499/2 +TCATTCGTTAAATCCAAGGTGCTCGCCATCGGGGAATATACTTACCGATGAGAG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:662:17029:25695/2 +TACCTGATGTAGTCCCTGCTAGAAAGTGAGCAAGTAAGTTTGGAGACGGAAATC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:566:4782:154015/2 +CGTTAGTCCCCCCGTGTAGATATCAATTTTACCCCGGGAGTCTAGAGGCGCCGC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2085:5947:93787/2 +ATTTTAGCCAATTTATAGAGCAATTTATCACGCTGAGCTGGGATGACCAAGACG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1281:315:57513/2 +AGGGAACAAAGACTGTGTCGAGGCACCCTCCCACTCGCCATTTTGGTAGCGGTT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2225:9077:172869/2 +TTGGACGTTTTAGGAGGGACACTGCTTCGACTCGTACATGTCCGACCTACCTTG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:373:3420:105949/2 +CGACATGAAGTTTCCTACATCTGACTACACGGGCCGGAGGTGATTGGTCCATAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1805:9722:68857/2 +GGAGTTTAAGCAAGTTATGACCAACCCGCAAGTAGCAAACCAACCCCTGATAAA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1350:15373:181220/2 +CATTGAGCATCTTCCTGACAACAGCCTATGTTTGTTTTCCGCTTACGAAGCAGT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:44:17607:19283/2 +TACAACGATCGCTTTGGGCCTTATTTCATTCAAGCAAGCTTCCGGGCGCAAAGG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1732:2108:100386/2 +CCAATTGTTACCTATACTGAGAGAGCTCCAATCTTGACTCCAAAAACTTGGGGG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:250:15883:128840/2 +ACCACAAATTGTGACTGTATTAAATAGAAGGTGTTTTCAGAATAGCGCTCGGAT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2113:7932:11007/2 +GCAGAACTGCGGAACGGTGCTAAGGAAAGGTAGATACCTCTCGTTCGGTGTCTG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1891:12437:47748/2 +TCGCGGCCGCACGAATCCATTGGATGGTCATAACTGAAGCGTTGGCTCAAGGGA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2369:5530:14886/2 +GTTCACGTATACTTGTAGTCACCACGCGCCGGCAACCCTTTTCTATAAATACTA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1742:17056:119120/2 +TTCGTATGGCTTTACATAACCCTACCTAGACCTTCAGGCGCCACAGTCCTCAAC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2687:831:113506/2 +CACAATGAACCAACCGATGAAATCCCCTCGTTAGGATTCAATTGTCCGACCATA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:451:18172:127749/2 +GCGAGGAAACTCCGAAATTGGAAGACGCCCATACTGCAAACCCATATCGCTCCG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1906:6140:154562/2 +GTCGGCATCGCTCGTGACGCAAACGAGTCTTTTCTTAACACTTACAGGAGAGCT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:745:10175:155937/2 +ACCCAGTTAGCAGCGGACAAGTACCAGCCTGTACATATAACTCAAAAGACTGAT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2679:13605:42940/2 +GTCTAATGGTGGGTTGTCTAGGCACCATAACGACCCGTAAGCGTCATCCCGTTA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:525:19584:47076/2 +TATTAGCAGTGCGAGTGGAACGGGCGTAAGCTATTGGCGATTTTCTTTGTCCTG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:541:14390:18530/2 +CTTTACTACCATATATTTAGAAAAAACTACAGAAGCAATCCCTGGAGTACGCGG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
+@HS46_3182:5:2371:19010:102788/2 +TTAAGCCGCGTGGCAAGTTATGGTTAGTTCCTGATGCCTAGAATTTAAATCCGT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1477:14980:45119/2 +TGGCCTGTTTGGGGCACTAGGCGCTGAGTTTGCAAAACTAGAGACCACATGCGC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2719:11588:38543/2 +ACCCGTAAGTTGCAACCTGGCTGCGCAGCAGGTGTAGTCGTGGACCCGGATCAG ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1734:8972:136811/2 +CGCCAAACGAAATGGTACACACTGCCGGTGTGAACGTGGAGGGGTATCTATGCA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2208:6046:184805/2 +CTGCACCTGTAACCGATGAATTGCACCCAGTTTTCGCGCGCACATTAATTGCTC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:173:5726:155286/2 +TGATGGGGAGGCCAGGCACACACCTCCTCGATTAGGGGGTTTCAGTAATGGTCT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1308:11903:50795/2 +TCGATGAGGCGACTAGACGGAGTCCCGGTAGCAACACGGAGCCAAACGATAGTC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:1898:10887:153698/2 +CTTAACGGTTCCGCTCCTTAATTGATGGGTGGTAAGTACCTTTGTTCTCTCTGC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:772:13347:20065/2 +GGATACCCAATGCCCTCGATCATAGTGGGACATTATAGGCAACCGATCGTTGAT ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:2480:4129:148337/2 +AAGCGAGACAAGCACATGGTTGCCAGAGCGATGTCCGAGGTGTATACTGCGCGC ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@HS46_3182:5:256:18917:52058/2 +ACGCGTAAATGGAGGCAAACATTTGGGCCGTTTAGTCACTGCGAGTGGCTTTAA ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/t/data/test3.bam b/t/data/test3.bam new file mode 100644 index 0000000..1b34227 Binary files /dev/null and b/t/data/test3.bam differ diff --git a/t/data/test3.bam.bai b/t/data/test3.bam.bai new file mode 100644 index 0000000..d98d42b Binary files /dev/null and b/t/data/test3.bam.bai differ diff --git a/t/data/test3.fa b/t/data/test3.fa new file mode 100644 index 0000000..954edce --- /dev/null +++ b/t/data/test3.fa @@ -0,0 +1,360 @@ +>1 +cCaCcGAtaaGtaTgttaaccCgAgTACATTaCcGGGCGaatcaTccgAAGaAtTCTTgaAtaagCCcCggcgaCtctTG +GataggatCTtgctcgtCcaTatGCtaagCAGcAtgGcGTgGaATAgTTcGgTCAtTtTTTGccTCGgacCAGtTgctga +ACCaCAttgctgcTaGcgagaaCggCaGCtGAGTgCAAAGtaTTaaGgTCGTgCaGgtTTTactaaCACGCACctcCtCg +CGAGtgATCGAGCtcggcgCgAtcGtcaGcgctaAGgCtAGAAGggAaaGCacCAcTAaaaGAAgGCACacgCtCtcAtt +gCTacTGgcTtcGACtGGGgcAtAGCaAaTggTTtgggAgCGaTaGACGcccaAgCTTGgtGggATgGatCaCaTGgCTT +GAcaCtAacaCTcTTgAtAGAACggtggttGAgGtATcCaGGTCgGTtGATtCGctATgTGGaAtTggtgaTAGAtcaGg +gCAtggCcAATTctTAGCTTcgCCggcAACcTCAgcgCGaCACCAtATGtggTGatTCTCCGGCtAaTaAAAcCTaTGtT +CgAtACCGCtaGccgCCCacaTaActTaAcAaACcgCgACTTgACCCtaAAgGgtCtCGgTacAatAGggcAaTGGGaCA +gAcGGcGgGaCcAAaGTGtaTCCaTCAcgGgCATTCaCggGCgGatcTCCgTAGCCGcaaCAtTaTtTGgAGcGgGCCAC +ACaCgACctAgGaAACcAagCAcGtgtacAaTTATTGTAtAGTcctgAtAAaACgcCTgGAATcCtTtCCGTCaGTcTag +ATAcgcGaCAaCgTActgCAagaCTACtTAtTctaACGtTcTgACATatACaTcTTgCacatAGttAaATGGcgatatcG +tGAacacgGTtTtTTtcGccgAAaGACtccCGGGctCCCGgAatTACAGaaATgACCTgTGtcTAgCCaaCctgtCTTAc +gaAtTAGGctCacGGtCGTcaCGcGGtggGACaggTAgtCGcAcCtCTTcGgAAcGgActCcATGGcTgatcacAcgcaa +gtgTGGaggtGaTCGGtCtaatTCgtGgGattCgcCAcattaggttTatcTCtaGCcgCctgTaTttatcttCtgcCCGg +GccCccAaggTctCCgAtcccgTTTcTTACGgatCTACcGAattGttTtcAacGGaaactCtaCGgCTTTgcGCaGacgC +gAtctGgAAagctcCCtACcGCtTtaGgaaAAtGAGCAgAAgAtttgCTTGCGgcGGgcCggTGAGAgaGCCGtGaGCTC +CTTCAAtTGtAgTcaaTtgCGcActAGAcctGGaCagtAGacCcgcACTAtcgAtCTtaggtcGGcCgaAGttCgCaTgg 
+tAGGGATtgTCtGGTCcAataTtacatgGataCCgcGTaacgGCaCCtTCGTtgtGgaagCAAgCTCGCcgCGgCCtTCA +CgggcCtCctGGaTCcCCaACTTtgaTAAcTACGgcACCAgActgaTATcGCtaaCCctTCaGTTTTAAggGaTctTAca +gATtAgTTTGtCggTccCTTctCaTtACAtaTTTCCAgAaAggtattctAAccCCatggcTATCAGAAgggAGGGtTatT +ccTtgcGtggAaatcaGCAaTaGcAtATtAATGTtAgaAgTcTCCTAGaGgCTagGTccgtGgaTtgAGCGAagacgtTT +GgCaCTGGtctTGTATaagTgTagTgcTAGcgaAtTcTtGctCtaaACggtGgGaCgtCTTCCGCTGtGTCAAgAactga +cCAttTgTCTgaaTaacggaGtTcAagGcGCCtaCactGAGccCAtTggtCaaagTAcAtGTGGTgaCtGcAGccCCGCC +cCAtCGCCGACTaAcgAcAgAggGTcCAtagTTGAtGcCatCTtaaCAGtagCAaCTTGcTcaTcTTAggtGtCGAAGtC +GgatgtggTaTGATtgAaTgccGTCGAGAtaATGGACgGgGGgAAaGTaGttaTGGCGtATCTATtTACtCCgcccacGg +ttgAagCAtTTTCtAAggaTCAcCTgctaTgaCaGCgActACTGATTtCTGttcTaCacaAATaTggGAgAgTGaagacg +aGgaGtacTACAggTTTGGGctACCTATagccaccGtctCcATTgctttctGtACcCTTacTcaCcTTCcaAgtGtCaAG +GCAcgTCtgtCAAAtaTctCaaAtgGggtataCgacTcaAGTtAtaTaGgGgaGAcCGCccAtgcTgGgcaatTGaCaaA +GAtCAtgACAtAaGCtAtGcAACcGtcCcTcagTAgtTtAaCctcCTcgTgCtACaCaAAtcATtaCatATGgggcCCcg +GAgaCcTaCCGTAGtCtCgGccTGCtCcacccatcACTTcaaAagAAtttTccGtGgGggGcCGctACaCaaTCaaacaA +tCcttATGTTCctTgatACTGtaTTGgAGCGGAcGAcTTCgTgaGCgTAaatgAgcCATCgCctCaaGAaTTcaCtTAcT +AgaaGtCaGtgccTcCTCAcATtggtTggtGAAatcCcgGGAGATCacaccGCCGTTacGagCgCAaGGgtGcatttcTT +agTttGttATAGaAgAAAccatTagatgACcttcTcTGgacagtgTgtTGgtGATGGgagGCCtCCTaCGTCtCTCggCg +GtAtctcTaTcAtCgtCGGaACcaACgGTTGCgcCggCCCggatatAAagAaaTccgTtaAAGtGgaatTCCGAAgCCAa +atTgccggAtCcAaCAcgaTCGaaTtTTTGaaTAAcAAaGtaCAGAcGccTCaAggAgagCCaTtcGcGATttTATAcTg +GtcGGTAcaaTgtGTtCcgAcCaTcCgTGaaaCATttTAaCTaCAtatGCCCgGgGgCggGaAcgCAgaatctAAtCaAc +cCgATCGTcgtcGTgTgcGaaCcGcaCAcAgTgAGAgttcaaccgCctGGtCGGcATAtAACGTcTggtCaCcGCGTAAt +TagGaGTttTtttAaCgGagAgCgGTTATATagTgCGcgacacGtGcTTgaAgtgaTAtTTcaattcacCctGgCAaAcG +CTActgTtTGcTGaaatTctgcACtcTCATCtaggacaCtTtGtagtgGgTAgGccTcaTTTTAGCttaAGCcacatTtg +aAtttGcggCatATaAAGCGtggcCGtCTAAGTTtgGGTgGCtTGaaAcAAgTtacgggTaAcaAcaAaTATCtTTAggC +TgctTggaGCATgcaCTGcGgtagGTCAGccgTTAgAGTGaGaCtTgacgTTcacgTcggTaAagcGCTAaTTAttAtaG +CTGCTGgcaTCtGGGccAGtGGtCTgcgctCccCaTAgctGgcACaggactTAccaaATgAcctgttTgaCTGTCTCCtC +tTccAtcCAAaTaTgCCTaACctGcGaAGtCgaTCAGAGGtAtGcttgAtTcgGttgGAGGcCAcCatTCtCAgaAaCTg +TtCggAtGaGaGgacTCCGCAGgacctATcAaatGTTtgatTAtcgtGAtgaacgcTcAcAgtAATagTaCgCAacTCTC +GAccCtgcgTCTGCCTagcgcCGtacCcaCCAcgATAttcgAGcTtTCccgCaGatggacAGgTtgATctTtATcAGtGG +cCgTcGtACaCtaCCgCGcaTtgTTgTCcaGgGaaGAcTttGAatGAcgaACTAtcGgaAaAAacTGtTaCCTagatGCG +GCgtGTaCttCtCAAgagCtcacCcaTagGtGcTAaatgCGTgAAAAAccAgTaAttAAtGGtAGgatGacGGcCTaAtC +GTtaGaaTtGActGGCgcCAgaATggGacaaCgCcGggttAAaCccatATttGCtGTTgacacGcACccgCgctCagGct +TaTcTTcaTGaCGcgGGaTatGCgCTTagCtcCAtTCGAcctTCTaatGTTTCtcAGTaatctCGGgatCgGcttGCgtc +TTtAtAtAGgTgTaGacagtagaaaTtGtAaCcgtaGgCcgtgtaTtCcaCTAGTaagcCACggGaAttcGTgtGCacAT +acTGtctCtCgATAatccAgGtgactgTGgggTGgtCatAaGAGCCAcgCtcaGCgGtacTGGCcgaTaCaTAtAGAaCG +cgGaAtccATCgtatctTGCAgCTaAGATgaCCGCaTtCCaTctCTCaATCgCgGcGATcgATAataGgcTActtacgaT +ttcATgTCTAActGAcCCGCAATcAAactAGAcctAtcGaaTAAcCAtccaCgGCAcGGGGatcAggGCcGTAaAGcTcG +TCcACtCACATggTAAACTTtgcaGTttCCCAgCataCAaacAACGCctcAatCtGtCaaAgTtTaCgGGGgATcgTcCt +atATcaTatCtCgGATAaaTAtAcAGgAaCTAGGTCTCtTTTccGatACgCtATGCgCGCTctAcgAAGacgCgaaCAgA +GaCAtTtgTgaCGtagGatAcGctGcggatAaCcTTGAGatggCagTtcCCTGcgtgtgagTccGGgtAacGtGAtaact +tctaccgGaTCatgcAATCTGtcggTaTaACaTCCGaTGcGgGGTAacCtCaaTGtATaAtActcctGATcaTActCaGt +AAgCgAAcaATGTtTAAATtgTCTGcgCTCCTCcCCattaaCGtaTATtTAccAGttatGgcaAaGAcAAACTgAaCtCT +tAAcGgcTgcCCctTGAtTttAaCACCAcAtAGttCTgAgttCcCcCCcTGtTaGtCcTTgTaAaATAacCCCCAgaaCT +cAatgGTAAtGCtcTaCaacTcgCGgGGTGATacaGATCtaTagTctTtTGAGtGgTatggTcGCcgcCcGtgggaGGGt 
+tcTctTAgAgATgaCcaGccACCgTtgTTtaTggGGTGCCtTAaTAGCctcAGTttCtAatAATAtACAGGCggccGctG +gtctgTgTGgAAgaAgtAaCCCAggatACACGcatcTctatATAgcgtccTttTgCGcCgtATAcatacGGGCTTgTcCt +ggtcgGtccTATGTcAtGtCctTctaTcTCaTgggcaTcCatTAACaGtctcTTCGtCaGttGcacgtgattGGgtTtAA +ccCGcATcAgtTcTGtgCGGcTtGTcttcAtCAGaaTTaCtTcCTtcGcTAtGgGCaAGaCACagCtCcTgGGGtGATtC +GtAaaaaCaTGatGCgTcaTGgcatGTGACcgTgCAGAtGtCtCGGCTCGCCACTGtCgcgCCGtCgACaaTCAgtATgG +ttcaTtCgaTgcCAtaaatacacgcagtGcAgcTTaCcAgcGGTggttgCAATttTgttAcgGtCcccAgGgGcCCcCGa +tAAgAGAcATTGtTTtacAcTtTatATGTTGgGActcAaCcTCTgcAacgaGCTTGgAgtGATagGACtTCccgCatgAa +aAGCgcACATGcGcggAagTAaagcTATGaccCAAaTCGGCtCTcTGaAGcgTAgCAcCCtagGCCActagttACTatgA +gaTTcCcTgcTgggcCGACCTcacccctTTAGCgTGCgcTCggtgcaggGaatTACTcTtCGAAtGTTTatTcCaGCcGG +CAtcagATgaatCGagtCAgGttgTttCgcaTTAAgGGaAcTcCagTGGaCaTTtTCcGCTctatGCctTtAaTTtcCTa +TcCGTcctgGacCgAGACcTCAGctcttGTcGTtCcTAAaAGCGtcACaCggaCCtttcgAtttgcgCtGtCagcgGGCC +GcTCaCATGAAttgtGGaAtcGAtACATcTGAGtGcTGTAaCAACtGtCAAGagtggAAaGtCATTAcGGTcTaGTATTC +GcGatTcagAGaTatcTttaTtCcAgccgACaataGagcTgtgAtTCttatATaATGGGAcCaGatGtTacGaaGGcaCA +tgacGCttActGTcgCAtCTccGCTgAaG +>2 +GtAATGcCcgAAcGgacCgAtTaaGTtTgGATgacGaTAAAgtcGgacCcgCaGcTGgCAGaaTcttcGCgAgCgTAAcT +GTaaactaacGCaAgAGttGCGGcACgTaCcTtTTTAGacgAgGTtCgCtagCcGGGCGctAatgaCcTgagagTaGagc +aCcGGGActaaTGgCcaGgttagTaCACaTcTgCatGTaCcTGGTcCaaCGacgaCacatAgCtcgAttTtTTgTTatGt +TaAGagggAgGatcATAaACCgtgGAgagATTGGGaCAtGCaaTcTGcaTaTAATcCaGcCaGtAgttaCaatgGAagtc +ttaCAcacGgTgTtcTGGcGtCgtAatCTggACcCcagCCCgGGTcAAGCcacGaACATtGCCGccTaTaAAaAtttGaG +tccTaCccagatgtaTAAagATACtAggTtGgCTACgcAcCtAtgtatatacCgAtcGtctGgACTcCCccGaCcACtgG +gaaTGgatTCCcAGgTAgGacTgtGTCctaTGCtATcttgCCCatCacGGAcTCaTCtctaCgACggTccTGAGGgaAgg +GCAcgAcCCTaTTgGgcCCTCTGACAgCCctgGCcTAtCgTaGCactaAAtatCcgCCtCTcCtACgCCctaAgGaACGT +AaaATCTCtgCCAatgaCaaAGtTAtcgttcgATCcGTtaCGGGgAATTacGATtGAtGttaCGtTgAgGgTTaCttGtg +GAgTTAaCCcCgaGCCcTGcaTGCcTTGCgcGTtcgCgtGCaGCaAtGtAGTCCCgggAgcaGtTaaGgAtCCtaAGctC +TcgGgAtcAagacTTTTacAtTaaAGAaatgTatAGaggCaaTgcCCtGAgaCTcTCcTagCAcTAAGcTCtAGTGtgtC +tCGaGtACAcCcGTcGGtTaattacgAagACTCactAACtTgTCAggactCTTacAAGcTtgTcAAacCCgATGaATCCt +AtcAggaGaGTCCCGTgCttCaACcttATcAtGctgTAatCCgtTaACTTCAGcGGCGTCaAaGAATGAaAaGggAGctA +TcGcGCtttAtgcCTaGCTgTTgcCtaCgtcCtTAGCaGtcaATAACccCgcactgTCgGcTatTcTGGggggGaTtgAc +aACAgTTATAAGatatAgATacCGTtAtacgctcAagatAcggtAtAaATgAcGaGCTCCgCGCgtAGTgtCCAAGgcta +ggctCGaCaCCgGgtCTaaTGGTCGTtTATTTCaGgAtcttAtAtaaGttaaCgGcttccCggcctGtgggCtTAaAATc +TACAACcGtAaaCAGAtcgcGgATggAgtggcTGcGATtattGtTgCCAcCgaCtaCCcTAAtCGCtcGATgaCCccaaT +gGAAaagaaTGTAGaTCatACgTGgCCtTtCCAcgaggtCCtcAGcggAcGcgATcgtGacTcGgAcCTACACTGtCaTc +GTAGaatGAaAGtgCcgttAGAcaAaaTGtGTAATtaGAAAcggggaaATCGGgattaCAagGCtgcAcTaactaAGTgC +AtgCTAATCGatcaTCCatGTTatAggTGtCCgttGGgCGGaCgAGAATTtTgGTtATcgtTtcTtAGatcTCgAaGttt +ttGttATTCgcTGcCcctGCcGtActtGctGGTgacaGTgttagATaGcggctaGaaGGAtACtgCtGGccTTaCcCTGA +tCTCcaGatCTgacgatTTtCaCCAttcAGaGAatGgcAactAtgGaagGgCtGTaTcgGgCCctaccccCTACtCgaCc +GATgtTcTATcTaAAgccCCtGCgaTACTtcATAtaGTcAatggtcgtatcaAgGTgACGgATCTaTCCgAcaGTattTa +cgGTGGCGCgcTAaCGGcACtcAaccGtGTGgAtgGtggCgACcatGGTcGTaTgttgCcTGGGcGaaGACacTaaAtGc +aCTaTccTtAatCTTCAAaaTgcTggGTGATCgTGAAcgGGTCCGAcACAcAtCTtTGTcAcaggTgACTtAAcAATCct +ctTaatgagTcacgGCAgCCCcCTGCaAattgTaacGatcaGaTATGAAAcTGTTaGCAtcgttTGcTCGtTGCccTgaC +AtgAcGaCCatagctacGgCACcAGtgccTATTcatggAAGCTgcatCtCAGgCcgTAAGtgcaAGgCAGAgAcATgcCt +gtGaGaCCAtgGtAcATAACTTAGCGgTCcgAcCcgGTCtCGAaCTtCCctattAGAGgGctGgAaGtTTactatgCAGT +aCGcAAAcGTTAaaTgAaTGgGGCtCgGtAaATcAaGCgccATaaacGgCATCgAccGtcCGgCtAAGGGGCAgTtGTTT 
+GAATGGGtctttCtCGgTgCtGAtGtaAGTTccaCccAGAGcCTTcgTaGtcCAtaTcTCtTCATTAGAattCtCCcgtc +TGcggcTTGACAACcGATcaAGCCaTCcAGcaCtccGtGatGTctTAATTgtATGtTagcTtcCtCAAtCTaTGTCcCGG +GTagGAcCgCACttGCGGCGCaaAtGaACAtTCaaGATtggcTTCcTcTtCTccCtTgTGaTgAgaGcaatCCGCCATAa +GGcaCccgcAacGCctttGTtcGGCctcAaTgAcacCGaGactTtAaCtGTcAcAtGCgCaTTaaTtTCAtCAGtcAcgt +aTtAAtcGcTgagAgatcTTgtCGACTaGgacTcaGAtGCcGGgAAagACaaCggTtGtgtGgTACtcgCTtCTGGCATT +TATgGGaTaAaTcCTCatGtcGTAaGgTatAgTAcGCGTtGaATGcAAaTatCCGtCtgtcaTGcCtaTCTaGGCCaGGC +cAgTTCCCCAATAtCTgAaGcAtACgTTCcTcgtgaCtgtGttatTGgtcacTCTcTCgaCgaagGTcaGctTCAAcACc +aTggtgGcGgTTgACCtGGGgGacaCCTCtaagTTagcTgAaGCGAtgATAAgactTatTtGgtacTCGGTCAcCgACtC +ATCAaaAcAtTGTgtgCGGcaCcAtAgttCTcaaAgGaAatgtaCCATctGTGttcCGCgagTgAcGCAcaggGCGAtAg +CTTAaTAtAcGtacagTtCacCaGCTGTActaggaGaCaCTCCtcgGtaGgcCTGcGTgTcTTtTTCgACtccatACCGt +cgCccTTCgGCgAatgTcaCaaCAGTgAAtAtCCAACAaCAccTatGAaGCAGtACAccCcataaaATaGGaTgtTCAGg +aATAttgTgtcgGGgCaGActaACAttgcCaAcGgTgGTttaaACACcGgtcgACactGatcGCagtACCacTaacaaAc +aGAACgAGCTttctaCCCTgtcCGAggggcTTgTTCCTGcatCGcgAAcCgTTaACgaAaCgCAccAaatTTgCtGGTaC +TcaGATagaatCacCTtCtaGCacGGTgactgacCcaTAtCgcTtgctttAGtTgAAaACAgcCgCAGTTgAcGtgcCgG +tTTcaCttcGgCTAtAAttcTTCcTggCAgTcAACgGaaTcTtGCAgGTtGGCttGgACaTagcacgaCaAATtCATttA +gACCtCGGgGGATCGgcgtgtGtTaatGatGcaaaaGGtAcCcCCTATgCGcCCCaTCacAGACatGaTGTaagCaGAgA +ggtaTtAgatggGGcaCaCTAGACaACTTgcCCCGCCGagTGaaaGtTGctgaaCCtgCtCtAtGtctACaGTTatTatc +gtCAtcCCgTtacctcctActagGctagcCcAtCGtTTcCCGccgGCacCTGGaacCaTAtTtCtGgctTcCTAGAgGtT +tCacAAaCgGTAccgcGaaGCAcCTcGAgaTTAGtagaaCcAAtaaccAGacCAcgaGTCaTacaGtAACaCataTaGcG +TACgGGGTAACCAAtCaccACctTGCGcatTTAtgccAccCccGtCcAAAAtTgTTtgTacATGatATaGCATaGCgtGA +gcgAAAtCtTGCAaGTGAccAgAgTgggCagAcgcttgGcCCagaCCCttccTCcCCaTttAgTTGAatgGtATGatTTc +tAtgACaAAAtgtcCcaAtTACcgatcACTTaGagtcATTtATgCCTGgAgCAAAagggTGaTaCggaTcCGGGTTcACa +CccgAgcAaTcgtAtcgATtCGGcCCtGTGaGccAtCtcaCtgGgCAaaTTtggAAggTtgATCgAGAAAAattGGAtCC +CtCCTCcACacTTGGTggaTTgGTcTGaCacAgAAgAggacgCGtcCGaCcaAGcAgAtcCcacAGGtaAcCAActcTca +TctCgTcTaTCGTgGACGTctaGaaAtacTTACTaTGGTgACaGtAGaTaTGgtggTCATTAactgTttAGaATCATGgt +AAcccAGtCttAtTAAtCaCtGaaaAtacAATCCGgcCtacCggGGTtcgTTTCaAgCgCCgGcAgAgCGTcttcAAAGC +acCgCGaccActTCCAAtAgCgGGctaacctATaTCggCtccTtACCtgctCTTAtGCgctagAcAggtAGCGgcTAtAc +GGGTtTCGGTTtCAAATCccaAcgtttcATgTGtCTtGTCcGcAgaatgGCgCgtACACAAcAtaTgATtTCGCggagcC +aaCagTcgCcGGGcAcgcGtaACAAggcGAcGCgAacaTcgaatAgaGCgtCtCTGTGaagAGTGccCCGAtccGgGgCg +aTTTgaCTcGGCACgCccTggACTtaAGCGActTactAaTctcacTAGgccATgTTTGAAgGCaCataCAtaCTGatGtt +cAtgTGgaAttaCaatGGGAGGAgcgcCaGgCATtTCaGtaGtatAAAgCaAtcCTgTatacgGGctaaTGctTtaGgaC +gTcGGCCccctaGgCTCTTgCgTGgTcgCGtTTcTGCtCaaGaAGtTTGtCccAcTatTTCGGGagagCCtGATaCttaT +TtGtaCaaGAtaTaCcTtccGGTcCaaCCaTtAcACCacGTAtaGCTGGGtcagATCttCGAATTccaaCCgtccTaCGa +gCcCtCaCcGaGcgtgGACGtCaGTcGGGTgGtaatcCTgGggGcGCgCCGgTcgTGgTGtGgGATgaAgtAactAtCTt +aCaGTCGGGaaaGcacAATaCtcCgCCAgcaAcTAtggGtcTGATGTtaTttgCtcGtgCAtacCaGAGACTggaaATCc +CCgaaGaTAgggTCCcGTatcatcAgACcTttGCTacaaAtTATCTCtgCagCcgtcGaCtGgtTCCgttCTCtcGTAtg +ACgTcGaATacgctCgtAtcgCTaAaGaAtgaAtGGagGtCGTAACtgGAgaCcAGtCcGActCagTcgtaCgAtcgGtG +aaAGGgaACcaCTgCgTCaGtTACTGcAtTTCGCcaTcCAATCACgAaCCcGcCagAAcaAAaTCcCCatCtGCtcCCct +gGggTGGAAAgCtgTAGTCGcCgcAcacCcCtCcaaCTCGaTGtgCATCTGtaGcTagActaTtAtGGcgAgAGgGTgGg +CCCaGccAAgTgAacgGtGttCaAgaTccaaCttAgcTgTcGtGctTctATAcgcACtAaActaAACTaaaCgcCgggaT +ACgGATTTgttcATaAtGcGGgaCGCAAgaAAGcgcgCAcAtgttGgCtgctcACTTAATactgAacgacAACtaAaaCG +CAcAaAAaGcCGtAggaAtTaGgTgGgcggtGgATTCgActGGCCGTtaCTatTgcaAgaACgGagaCGAtCgaGtAGtt +aatGATagTGtgGTCGAtcGCGcTTtatGcGAacaTAaagGTttAtAaCtaTCaaTcGttcaAatgagatCTgcttgGCC 
+GTtACcCAAaAacGAgTtAGAtcccAGAcgAAgGcaacTTggGcaGCGgTtATAgTTtttaaTgGagAcaAaTgtTggCc +tTgaTGgTACgcGGCcgCtTAcAtGtcCCTggcAACGTAgtGcgtCGgTaAAgcCccGTCTTCCaCAGcAgGtTGtagcC +cacagCTTggGCtaAAAtCTcacaGagTTAtaggAaTgACcaCGcgtCgcTgGCatagaAgGGAttGcTCAtATCTCCtA +TtTGaAAtgcCAcGGgacCTTgccATgAtCCgttCGTgCTGGcgGCTTaaaAAAgAtgAACtGTtTaGgTcAgAaCgTGG +TGGgAGaAgAcaactCcgCcGAcATCCGtCGGtgAGGAgGctCtGcACtAGccGaTCCAaGTTTgtaCtgGtCactTttg +TaAGaaAtCgcATaggttAcCACcAcCGcaAACcGTTTtaTtatGGAgAtgCatCtGTcAgcAtAacGTgCagcCgAata +GtGGaGtaaaGCtGagCagtGgcacGTgctTGGaagACctgGATCcTGTtTTtCaaCAcTAtAGTaTCtGGgTGCcGgcA +ACtaCGGtGtAcAggcGgAggGGcTtgAAaCTGtAtaTAAAgCcAGAGcCagAgTcTGtgcctACgCgaAAGcTgatTGc +GcAGccAttCGTcGGctgagtAgCGagaTCTaAaACAgACtgATGGcggAgGagctACGcGgctggcCttAgAGtgCggG +aGGatATGGcaAAtCATtGgCgcCcagAgaAatCCtagaccAcatgGGcCTcCAgagCAAaATCCCaCCcTGccaTGGCT +TgGgcaggAgCAaTgGGaCcaGtGAtgaGTcgGGgtaAcTaACctagCgAgGTCCGGCgGATTAaGggTaTAAGaCgTaA +CatAGccCAccctAaCGTtaAatTcgacAcAgTCcCGTTAcCAAAtTTCtCgccGTactTaAtGCaGCGgtTTatgcggC +cGaGTATAGtcGGgaTTgCgAtTAGtttatActtatCCTcACAGGAcCcgACTtGgcgagCTAaCTGTGGcaaGGCtCat +CCCAtCacATatCgcAgagTgCAGcACCAcctGcaCGTGCactaGtctTCaAcgGGaAacGAGCCCatgtgGggCCgAAC +cCtccTtcGtCTAttCCaAaGgTAaAttagcAaataGgAcGGcgggCTgatCggGcTATGCtGcGCtATGCcGTcgtgCg +aCCAGtaaTcCTCCGCaAcTCACaCCACCgagATaTAcaagccACAgCtgccCGATGAACGtaATgtgCGTggtaCtTAg +gAgtCGtTccCgtaacGCAAGgtTCcccTGcaCGTgcGTTggAcTgGaTTCtTTaAaaacAatcGtcgGTTCGccgTTcA +CttgTacTACttaGGgTtCtttcTTtAaTTctgAAAtAGcTTCcTaGGTcTTACTAcgTTgtACtAaTTaACGGgAtTAA +taAcctGcTAAtGTACCGtaTcaCAgtCaACGgTTTggCTTaCAtggTTcgTTgcatCtCATgGCCGACtGATACgGAcT +GagtCcATGttCGCgGgcaGGCTaCtGcCCgAtaaaAcCcTCATgGTTtcatCTAGacCCgGGAACtCtTTCgTTTacgg +GTcaGatGGActCtAcTCGCgcCTTGTTAGGAccCggcGGCTtTaGGgaaTcGaaaaTTTtAgagACgtCCCCGaaaAGg +TtCgTgtCgcgGAtgTCCAGGActgctAggGgTTaAatCaTcGTTCtgcaAcgTcCGAGcTaCGaggtcCAAttCgAGTG +GTTcaatACcgcaAttCAaaacgagTgAAAAtCtAGcGgAAAtagcAaAtTcaCcAgGaGCgAttgtAcCGaGcatAcAG +cttgctcAaaTTtaAtATGAGAaAATgActCGCtacctaGCgTgTGcaaGGtGaagcTAGccaGGcCTgtaccGtCcCTA +TAGgGGggAtcaATgGaCAcCTGccaTgcgcCttgTCctACgCCTCGCctTcTTCtCcCAAaTGAGGgTcAagCAAgCCG +taAAaatgGcTagtcgGGTcACCtTcTAtgAtCcCtaAcaGgCGcCaCCAgcGTTtgTgCaGttcaGGacCAaaTCaGgt +AaGTTgGAtctaaTaTTCCcgcTgagAtTTgGcAcatctCctgtTctAagATtctgACcTccCCtcAggcgTATCgtCgA +aGGcgCaGGCCtCCgAatGcatcAGagAAtcaaCgtaGggGaCgCGAGcgcaCtATTgAtCTaagtaCaGTggcGTctGg +aCAaCgtgTgGctaTATgtaacctAGaAACtttAtaTGGCCcgaacattGGTaAAcaGggCtGtctATGgcaaTtGgGGG +tCacTgcActTtcaGTtCcGgtaaggTatCACttgATAaaTCCAtgGGtaaacaCCAGAtaTgACagtTGAcatttAgGC +CcGgCccCATcAGcAtgcaTCGTAtTCCAtCCTcCTAtTGgtAATaTcctTggCtctctAAaCGaCTAcAccCgCggaca +aaACcGGgTAAtCGtcCTATCGtGgTATGTtcttTcTtGtAGtcTgCTtCtatTagGgctgaCagaGCaTcgTATatGCt +TCcaGaaTtgcAtCTTcAGtcGgtagGGggcCatcTCGGCcgcacccaCgtTgCaTTggtAGaTtgaGagACgGggTCTg +taTcaAAtAtaAGCGAa +>3 +tgGcAgtGCgcATAgggctCtTCgCggCCCcatTGtTAGaAgCATgttGaAtccTgactTgagaTtggtACgTaAgTcAG +tGTTGAaaAGaTaTaaCAacAaTCTGagTagAtTCGatAcTaCCGttgCATagctgTaAataaTaTCaAaatccCTagTC +AgcTgAgtcCttgTtGaTccatCgactgCCTttAtaTgcTGaTcACaATacCTttTcAcAacATaacgaccaaaAATaTA +AGGACGactATcTTccTcAacAgATtGgTtcCggaGaGtAaGGgtgATaGGgtTCcTggttgAGtacCgAaGgAgtgtgt +atgGcATcTgGGCTcATcTGacagGCtGGAtgGCcttctTgGGtgggCtAtTGtAAatacatgaGGGgGgGCAgCGttTC +aGaacCgACTgGTgCGCtttAgACGacgCtGgaGgacgGgtGAcCAaGgAGAGtTGGccGtAaaGTAgGGCCAAgAATta +gaTtCCtgCCCtAagaCaCCGCtTcATTCaacggaCCTTGAagTGtCTGaTtAAaCgctgaCcgGTAActaGaGGttacT +gGcggcaTtgCTCGcATGCAcTgTCaGTgcCTGtTacAccaaAAgcCcCATttaGcACagTgttgtTTGtcaAaGTacgT +atGcTTCCtAcaGCcAaggGCaGgAgctGAaAAAAaatTatAAGggcGgAgAcaAGgcTgaCcgAAttaAGtGaAcCGTg +gtTgaTgAcTAtTctcgaAcggCcGcAAACcAGTaAtAttgcggTGtCGCaGcTaTccCGtGtAgAgacTTGGTgTgAAg 
+CATGgCTTTTACATaCaAtcgcaTCtCattcCtgCCTggTTTgcTGgCcgCtACtTtCaaCTTccCAGaTgcagTAcgtt +tATGTTgGGtaaaaTaGGaTaTaCTtgcgTgtCtCctCCtcaatgGgggaGctGAAtGcAGTTTCCGATaGTTACTATag +AAACaAAtggaCAGtAAgGgGaTaATcCCcTTtattgCAtATCTagtcCCtAAAgCCgtaagacCcaACACctcTGagaa +TACTTCgTTGgatcCAaGGCAtCcAtCtTaGTgTAGGAtaCCAGGAGTgggaTtcAActtgtGGTgCgGaCacCaCAaAA +cCgtgctGGgcCGCtCcttgggTAAggatGgggacCtgCgAacgAcTCcatCggGGAGcAGaCACTgACtAtAaTatGGC +tcGtGCTTAcgtgcAtAccTgCAGtatAGTgGGCaGtcCaggTCcCaagTcGacgGtcCaaTGAagGGtagACAATTTat +GTcAttaTTTACGCAtGttGgCaaAGaCgTCCccCAgTTgtAGGTATaAGcAcaATTataaCcttgcaCtATCatCGaCa +atgCGTAcCagacGCtcatCAaGaGAcTTaCtGCtTActGattcgTTgAcGaTGAaaCTAgcGCAcAgTcgGCtAaTTgG +aAATggCGTACggatacTtCgGtaTgggTGacTctATAtaCTcgaAAtTAGgaattccAtcaatAaaAcATTatatGggc +GtaATgTtGAAATCctTACgggTattTTAcaaggGTcACTCTAGtaCAcTCGATaacgGATGCgggCcgaTCTaGatctt +aGGgTcgaTTgAactGcTttcCtgCTtaTTtacAatTcTaAtGGcattcaCaagCgaCcGTAgCgaCgatCcgGTaTAaT +ttaGcatatGaAgTAggcGaGgaTCCAacgGaaAgCCggCgCcgCcGTtCAAacTcgaGGtaAtaCTgcaTTgAAGccGt +AggCTtcgcGAtGcCcaTaAATCgTtTaGTcAGgcGTaCacaaCTAaGatctttctTTCTGtcaatCtGCaaTgaAGCGg +CacTAGgCgTtaaCgGCTgTgagGGATGGTCATgTcGCTAagGcCgCgttGtAtacTgttAaAtcGCTCtTcacGgcCgt +TTgTcatgTgctcaTTtTtAtgcTaAgcGgTgcTgCATGGggACaAAGAtTAaCAgcTAaTcCTTtgCgGcgTaTaaCgt +AtGGgTtAGtgCttgCTtTggcGGcgCaaatctTcttTCagtGaTaGCcTTCtAgAtCatgTgAGaACgaCagCaaTCaa +GTAgagatGctAggCatCggGgGTACCcaTAcgCgtaccTTcatTTgGGCtCAAaatAacaTCcAACcTCgGtCTGgCTt +GTCagCtAtcacGcCtttcaTggGctCgggtgCcgAGGAAGTtTtggAGGctCTCgCcCAtACgTaTCGcAtttcgcatt +CgtGCAtcAtttGATgtgAAcGcAatGcATcgATtttTctCCTTGtGAAGGttTaGTtAgTGgtTGgcGActgGCaCaTg +aCACtCTgCTaaaCcatgAtTCGGTGtttTGGtcGCcAtgCcAcGgAAgTagagcTATcaTttcgTAaaATatGctcggC +tGTgCAcGcGAtGcAGTgAaGtGttatgaTAcccGtGgaGGGtTtCgtTactGcgAagACCCgaAAggCctAaaGAaCat +tAcATtTcCGctTTCAcgcCCtgccAGcagGAaAGAAtggCaACTgGGCTAcGCGGGtcgtAAtggtGCaaGtATGCgCC +gtCagtgtCgtGccCacCGtgaaGaaAAaaGAcAagTGcGtGGaTAatGtTccGTaTTCGaGgCTGAaTcTTTAcACGcg +TTgaCTCcgTAAcTaGAgtGcccAtcGGtTTAttCtTAccgAaTACGagactGAacgcCCTgcTtGAcaACtaTaAAGcC +GggTTGatgctaatcgCagaAGGGAaGgTtTGaGCGaGattAAaGAgaAaCgAAGAAAgcGatGggttTcgAgCCCaCcT +cATAGAGCGCCacaagGaaaGCtGagGttTAcCGGaTatcTtCGGaTaagTGcCTtgATcctCcGctatcggGCaaAatA +TTTaGgATGAtggccCGccaacacAtAcAgaTGCAaCGtAcGcgaaaaTTCatTTatTagGAcGAtATgtGTAATtaTCc +tGcgaggggCTcGCcAcGCcGttGcaGACgAtaaATATAacaTAGTggacAtCaCaCaaATCTTtAAAccTCgGCaGcgA +TtTCGtggTAAtctAaatTcAcaCCaTtAcaCcACACGaAATAaCgGgtGtaTTtttCgGGttgTACgCtTctGTggCCA +CtgggtTTtGTactccATATcCAGgtAcTATggTaTccGtctGccAGCGAAagTAaAAgtAtgggATAgATgTcCCCgag +gAttgTcTtAcCGaCGaCTCCGTatCgGCcgccTaCcGtATGAcCAaGTCCaggCTcAgtacgAcCCcaTGAGTGGattC +TaGTtGagGcTtttggacTgGCgTgaTCTTCcGTgACcAGcgGCtCTcGCAGatagcGCAtgTTgGAgcCcgGCcaAGGc +GTGGTaAaGTCGGgtAtaAAggaACggTTCACaGgGgTccGTACttggcaAGTTgcttGtAATTgCGAcgCAAtcCGgaA +ccCCgGCGgtCgcCttAcGtgCgGcTCtgTgATATtgtAgTtACCccTtgGCAgacCgaCCaacagGTcgttaAGaAgcA +ccTtGtcTGcAcgtAtGcTaGcctccTaGttCGgagactaCCATGaccCGGGcCgtTTTaGcgTAGgaTCCcTtgtaTaG +cGCaTCatGgcCTgAgaaATtctAGTcgaTCAaTGcAcGactTctccaaTGGGGCgtcCaaGCcGaatcgTgaATTAggg +CtaAAaTgtATccGaCcTgCgTgtGcGggTtTaTgGCaactCTgCtcGTTggATAtGTtcTTtgAtCaCTaCgcAtCgcT +gTATgCCCcTaggttCtTaGagAtActaaCATTCtcatcTGCcCtagAAtaTaCtCGaTtgaccCtatAtGgCgcaaGca +ATcgGCttTGACTgTcGTATaTaGgGggatAaGtagaACattCcGAAaACCCAcTtagAAtgcCgtACtCactTtcacGG +cCgCCcCGCTagCcAcGcAaCgtCgAGTgAtgCcttGccAtCtAgGcgAaGCtcCtGAcagGaCgcCTttAtCagcggGg +GtCtGATCacaCCGgTCgcgacCTGAAGgCAtAtTTtGGTTCcTGaCgtCctaaGAacAaTaTTcaAgAAGGaTTCagTt +CACgAcTTAGggAgtAatTTGaAGacATCTaaTAaTtaagatGAtCgCGTtaAcAatCcGgtgAcAaGgaTtAtcCaTtc +CTcatTgTAcctcGgcaccTaGGagaGCGAaatTgTACaAaAGtcAgaaTGaTGGgACCTTcgCgTtTGagcgtAgGtTt 
+gCtaGcGggaGttatCTgatggCCgcgAACTctCaTCAaCCAGtaAAcCgaatAActtgCggTCtcGGAAtcCTaCaGCa +AggGcctCTtTgTggCAGGggcgtTTgatatGgTGactACcCGAgGAGAAACGTtgcgAaaAcATtttGactaCagAgCA +gtGctTCGTattaGtCTCTgAcGCGcaaaCGGTCgcaaTAgTAAaaTCGgCtGTTgcAacTaTgTtGGgaacCTaTGgTa +GgcTGCtTtTgcaGCaCTgCgGAaCCCCcctTtGgcGtGcgCaagCcttATGTCGCAattaAGGccgACGgccAgttagg +CcTGtGGaAttTCCGGGAgtCtgCgAAGtTaTgcTCGcCGTAgTtGCagtGAGgtaGgCTgggaTcTTaAGgtaGTcgAc +CTTaAgGagaAcCcaGACGGaTTtacGTGagAgAAagccatgGcAtTaTgATAatGACaACGacTgccggaTCGACacct +gcTtgTcACatCAcgTATAgTCtCCgAgCCccCCgatGttcCgCGgatGaagcccatgtcTGAtatCACacGCCaCcacT +GCcTtCaATtaAtaAgacGtcgATCGaaggCgTaacGgtaaTtaaGtAgAGcgaacaggTcCTGCtTGTGCCgGtCtCCa +TTacAcaGcaAgaGcgccTtacCtTTacgtgACCACCCAcTGTGAGtagACaAAGaAgtgCaTtaactGTgAGgagTcaG +ttcCcgACgGgagaAcaaTTAAgggAgTtaaAaCaCcgGtTTcGTAtcCactCccTCaTAactgTAACCtTcTaGGAGaG +AGCAtCcAAagGAgCcAtGCcaaTcGttAcTtTTgGTCCATcGAtggctaggGCATtAcacGgTTCGtgGACaggctcTC +AGcGgaCAcGaaaaccACCTTggCAAGggACTCTGggACgGCaAcCAaatatgGggCCgtacTagaATcCgCtTTgCttc +tCctTtAATCgaAaaccCAGggtagcaaatcccCAtTgaGagTcaTCTGTgGtTtAAaAtggGGgTtccAAATacAaGGg +TAaTaTatGaGgAacAgAatcgACTTTGCAGtatataAtGGcagttatgctgaTTaAtGcaCAGgaCagAagAGcaAaTc +gaAAaAtaaaATTCtgTaaggGcTgcagTaTGtcCcTATACGCTACcaTTAaTCagTtgcTtGaCgGtGacaCAaaCtca +aCGCgCaCAttAaGAgaTTAGcGCaCgTacTAGAGttaTAccatTAAGCTGtACgGGttaTTTCTTagTTcCAcCCGttA +agaAGcgtcagTTCGtatGGggATgcaAgTCACtcgatgcTtTaCgaTgCTCGAGcTcctaCaaAcTGAACaAATccaTt +TcTTGccatTCccAcCAAtAtGTGtctcCCCaAagtgaCGTTgatgtGTCtAAaCAACcATGGggCTgCgtGGCgtTGTg +aggatttTgCatcacgtcTggGAAaTgcAAATcgatcCAACTGCTGCAtaTCAtCCAaaGCTGAAtTAgcCTTTtCgcaT +gGggcTaATCcttgTgtTaTAAtTGCgagTcAgTacgtaaCGGGcCGaAcCCCgtaGCatTatGtGacttTgTgcCCAtg +CaCTgtGATAGTgtgTcTCGTaGggAAGaGCaggtCaTGGaaAtTGtCTAatTtAGGCAgAtaCtGaCaCCTcgtcTcTA +gaCTGcCaaGCTtcttTAaAGtgttTGtcgGgCgCacgAtgGTCcAAtcGtAaGAacCATCCGagtTaaTtGAgcgCAGA +CActagGgGCTtTGgGCTAtaGtGCgtCATCAagtaaTatGAAaCcgCgtTGaggCCttAAAGacAcgTCGGGtAtAtTA +tcAGAGttcCGTAcTaaTCcAaGaatCAgatCaCcccTTcTCGcTGTCcCcACGGGCGttAtgctTGTgaagAaCgTCAG +CcCAaAgTAACggCcgCgCTTctcccCCaTCTctggttgTCgcgaGCccgtGaCcCCtatTCaCcCgcAGTcAGtCAGgG +ACagaCGCaGtTGcGATgaCtCGcAGagCTAAatCctAcAtGcacttaGgCGcaaTCgGgaaCTaaaAgtgaaGtTtTaG +CATTAGaCaagTcgCAaataGTTcCGaTgctaCctCtccGcGTgGTaGcCGTgtgTTCCAtCTAagGctCgggtgTcgGC +CaAaCTTAAaTgGgtAagTggtGgCgtcTaCaCCtgTTTgcCacgccAccGCGttTataaCCAGCcgCCGcGagaggaCc +GaTcTcGtaTccaAtCaCAtACTaCTATGGATtgTtCAccCCaCAttTtTcGaTCGtCgGgTttcGaTgCtgGgAGTtcC +GcGgCCTggaACTTaaCcGAAAcCctTGGCcaAaGaATGCGtATgGtGtAGAAcaGcCGcACgCAAtccgaCaAcaTCat +gcAccaCaTttaTCtCatCCcaTTCgtTGtAaGacCGGAccGACGgAccTACatagTGGcAggGtCAcgCtGtgtgGAtC +aCAtcctAGaGGcCaAGAACCgctGtGccCgcCAGTtCTaGCTCcggagtggGgTgTtcccGtCacGcCTgGgCaAtTgT +ttcgTATGtAGCcgatttCCTggAGTctCggGaattCaAaTCcacAGgaCTcCtaCAccgcCtGAtacGgagacCCcatt +GGGGCaaGGCGCtcGGaAgtACgGaCtCGCgAgAaAtaTatAAcaatcacGgGcAccTGGAaTTcgCcAcATggCtAaCc +CCggtGcaCGggCtTaaaGgtTgactATCaCcGAGcggcATaGTCTCccgATcagTgtaaGTgCGGcgCCGgCCATtCct +tCgggTGcttaaGGtaGagTaCcGTCGtTTaGtgTccgGtGgACGCaGatataCAtAagGGaTctaAATAagAaAcGgta +ACTcGggtcaTAgGaaTATgatGaTATacTATGAaatCacCAtgCGagaaGCATgCaGtGTCTGAACGAACcgAtTCGCT +aCGccGTgCgtAaaTGGagTgGTaCactAGAggTGCgGatGcagGAgccAgTCGAGaATTGTaacAgcACcgAtaTACcA +tcacGCGAcacAGTcGCGCtgACAaCtatcGgatCGAgcTgcCgCcaAgaAAaTtAgcTgGAAGcTATGtCgGCTgAGAg +gCCaCcTCTACGAGcAccagaaAtGcTAatTGCcaACGcCTgccCGTgaCCgtCttaCaaaaAAAtgTcGtCCGggCgGg +aAaActCTgtTAAagATGgCAcGcctTgGGaaaCaGGCcttCCCaCTCgcaGCTccaccaTAGCCcgTGGCCgtAAGCGg +GAGGtaCagCgGCAaAggcGcCAGtgtcCCActTCtTTataGTAGCAACcttaGAcCtaGgtTTaaGaGGACTGGATttC +gtTtATgGaGaAtggcGaTcgCccGcgtCCCaaaCaCAGtcAtAcaGcaacgTctTTctAaAACtCtgcaAAGttCcGCC 
+GgTTaggcTaaCCtGacTTGTTttccAGACCtggaaggtACGGtgTcaactGcGACccgCCcATAcCtCCcGaTTAtcCT +TTCAtCtCCCgaAGGtgACTactccCgagCaGCGtGTTgatAaAgtGatCgttActAaGtCtTttcGcaTATCCcAaggG +TCTTCGGAcatcCgaagaaCATgaAACGgagGCcAtAtcgCtcAACAACtttctccgGcTcGTctCAaacATacaCaccg +TAactaccgtAcCTTcgTCtacCtGaTtGTtAatGAAGcaCcAtCCAaAAGGGgcTtTTatAATtTGCaCTtAtcATGGg +tcTaGaaTGAtTGcCggTtagAtGtaTAaaTaAcAAgaCacgCGttAatgAAtcCgTCcagTctgttaAacGGTAtaGAT +aGCGaaaTAcaAcGaGaAGgacGTCTGGTgtAcGGggACcagtGtcAatgGATGaaTTATgAtaagtcATacCCgTGaTa +agacATcCGACGTtCccAgcaGttgtaAgtATTTcACATAAaagGTgcGTtAGCccGtataTtgAggcTaAgCAcgttcc +gtACaTtCggAaTTCcaTaatgACgTaCcTCgaCgtACCcTTAcgGAgaaAGtAATtatcaTTgaCGAaGTAaacaGatC +TGGagTTaTcttcATgAaaTCatgTaAaCAaaGgaCTAtcgtgTcTctaCCctACgtgTATAtAaAtGagGgaGccttTg +GaccgcagAtagagtGcacattTATaTCTcTtgGAcGaACAtgaaagaGGgtcATcacgACcAAgagAgGGaTCgTTCgg +CAgTGCCCACctAAGATccgagATcCtgTGtTTaGgGaAAGgGcATACcCAGAAGAAAcTgtGAaaGAtAgtcTCTTGcG +catgcCtgGgAgATaGGCtccctTCggCAAagAGgCggTattgTtTcGtTGacCcgggtGCaTTgGtCggttGAATtgGc +CtcCctgTtgcTG +>4 +CGgtATaTacgcACAAaAacGcAaAgTagAcTTTTggCgGtaaGaCCtaTgAGgcCtagTctttgGaGGCcTtgtAAaCa +gAGtcgaCgGgCTGccGcAtGGATCATTGggGCCCGGgCTcTTTtTAGcGaAataTgAcgaGtttCTGaAttGtcTACCA +GaGGCgTGacGTaGgtGCaGGTAtAatatAcacCTGgAgAcgCCCGtAgActaTATcCcGGGgCatcCcgaccctacTGG +TtGCgcGTggATGaAattagaGcACtTTTacCaGGTtACTaATcACACgACGatATTTGcaAGTaGtCcgGcGCgatGtC +AtATTCcccagACAaTGTaggcggactTCgtATaCgGCTcTCcAcgGtttgtgAtcgcTCcTgGaGaCagcAGacAcgCc +CatTtAGAcTCacGGGCCtTcgtCCcacctctTcacACccAaAtaccacTtaTActAtAttGtaACaCgTgtcGaACtTG +ccaTgatAaGCACaCGatgtctTCAtaAaTgGCcACACAGcCGCtTAcCTTaGaaCatAaatgGGTgATAgggtcgacGC +gaaCCTaTctctTCgtTaTttatGaTcAgtACaTggTcgTcaGcCGccaAgTtGtGTaTtAtGCttTaaGgAATTaACAc +gAtAGTAgTCgagtttcacGAcAAGcTCCccGGTCCtgctaaccTAcTCgGcgttAtaACcccCTtcTCgctACGCTAga +AAGTctCTCgAgtaAtCacttTTcCCAttcGaGCctGgcgAtATcCGtTTTcTagaGcTTAgacCCCccTAggcCtAgat +CtctCCTACgtTGgcctcTCGtgAcgAccAaAggTAtCACCTaACagaGtcAAGaCGtaCTTCAcatGtcatgAcTcCaT +agGcattgTACaccGcgcAGgcACtaTGaGCGtggGggAtAgaTgcgcCTGAaCttGAGtccaCAGAgGAACGAAcTaAC +ggAaatTGcATgCGAcCtcCAGgtgtCcActATATTatGgACCaaGgTccGGTagtgTTCTAcTATcATCacaCTgACcc +cGcgcgAgGCCTttATAgcAtcCCtaaCctgcAAaCgATgcctGTCGgTcatGtcCcAAggGCCcTgATAccgaGAgatT +tAtgcctGCgCACttCtTcAagCggaCAGCggtCcgTGaacGgtATCaccatGATctcatAcACaCtcCtAGtAcGcATG +TaAGCAgcgctgagtAGgCgGgGctTagCTGAaCGcggAGGaAtcGtAtAagAGgcCTcTccCagagtCATtCCaActAa +ccACCTggCAaaGGCCctCatCaCGtGGcGgTgcTTgCAcTcTaAcGAAGTtGcctcaccgagGAcTgAaCGGCGcagtg +cCgGcCActccCAgcCAGgAGagGGttTcatAaTCTttCGgTAGctttAAATGTaaccttActtTgCAtGTTTgAgaGtC +CaAAGcAtGAtacGGTCGggGcaCccAGGtTGTACGgcGCACgTcTttTCCAcggTtGgagaaACcACgccaCGTcGTTc +TtGCcAGACAGTcATTTATCaccttCACTCtCctTcgtAcaCcaAaGtgTTgCTAcAccGGATtGTAtcTccGcTttcTa +tAacCGagtgTtCtTgggacCGGcccgctcCgCGGaAcAtCcttttTgctcgATGTcTcCgatgcCGTcAgcacActGCg +CGCaAGagTGtgAACGtAaGaaACCAttaccTAtTATcTgtAaAagcTtgaatgaatctTtcTCTAGGGGTgtGgacagc +TCTcGaccTccTatgTTCttcTCcGTtCTTgatTATtTACCaCcggag +>5 +GGtTaacCtcatcAcaaTGcAaGgTaCAgtcgACatAtcgTACTgaaaCTTtCttCccGcgCaGttggTGCgAGaCTCcg +ccaAgTTGgtAgTTTCAGaTaTAccTcaggTtACGggTgCGGaACACCAGGtgGCatggcCggCGtgGGtGcagTtGTgC +TcaaCATcTgAgCaAgAccaTtaGAcGaGtTGGCgTccACggtAATtttGggGCtgAtCcgggGaGtcGAgtTtTGGGgg +TGcaaAcTGgCgTGaaATtCtcAgaGaaCaAGgTtCaatAaGattgAagaggtTCacTgACTCCTgTTGCAACCCtCCGA +cctGTcAGccGACTgTaAtttTcggCAAGgGAtAttCaCtCTAcCgcctctAatGggaATAGCGCcccCCttCGaCCgat +AtAcCACcaattCggaCtGtCGCCCtAtttAataGCTgctgTcGcgGCtAcGcccTGTTagcGagcGAttCgtAAgtaCG +CCacAGTAActATAaGGCGCcTCtcaaAGAGAGccTtCtATcAtagTTtTtCtgTaTGTAtcGGGcCacCgggTTcTATg +TTcTGAgCCgacCcAtgGTTAttTCgaTGgCTgcTcagAAAcTGaGgCGATTTCgcattGtCAcagttACcGTTAGTtTA 
+AGGTGTtCtAAcGACgtGTCTAtAGaGGCactGgaCaCctgGagCCgCttaATCtgTCATTtTgagCAaGGCaAcaAGCc +CtCaCAtcacCcatGTtTGCcCAagCAtCaTagGCTGCcaCGcCGGacATGCaaAggGGCCtGAGAtCCtCtgCaAAggG +agtgcacgcatgGtGtGatgTggTcCaACcGCAgGGgTAAcGCtgaTTTGCcAtctTactgtTcTttaCcCCCTACgTta +TGACaCAAtcGCtTttgccACagAACGggAtTCAAagcCAtAtATAcccgTcaAtGGACtTAcctCcgacAgTCcCTCGg +GgCCtgtTttTAgGGCgTTGcaCCTtAtTGgAtaactaaCCaAaggtaGgGgcaCtCtgcagGAgCcTcCaCaACGTgTa +GcgCAAcCtcGcTTCtTGGGAtTtTcaGCCAGaAtACaaAggGcCgGtAtCTATaTccAacaTCtatTACgTcgGCGTgG +gAgcTaggATTcaCgatggTTaaacTcTTAGaagAcaCcTCCtaATCcATTTacgcAcAGcTGTtccGTgTagAgAaTAT +AtgATAGaAAgggACCTTgAtaAGgtaAAgcCaTAAgACcCGGGcCaGgtCACaTAGTttATaTccCAcCGcCCtACgat +CccCGaccatgcaAAgAAcctcgCTtGaGaCaTACtcTgtAgtCGATATaAtGcCCttaaTAtttAcTCCACGaggaTAG +TTccgTtcgCtGcTGTcCATgGcGaCcTcGaAcCgTCCgGccaACgtTGaAgtcgCCAGcgAAcCcTGgCgGccaTttaT +AGGaatcacGgGgcGGctAaaccgaTCcgTcgCtTgagcAcGGTATGGAttCgTtTAGtTAcCccaAAACGATAtCtGAt +AttGcaCtAAcGGAAtCTaGCAGTaTggCGcAATaGtACagaccCCAAtAcGGaTcTactGGTaTCGcCacTtaGggcGt +tTcGACCGgaaTAtcaGTCAcTcCCagGTGCAcggTAgtacttaCgCAgctaaatAGGgAtcAaacaGcCtaCccGTGag +agAcaGcTAGTaaAGgtACgACGacGtctTagCgTaTtTCccCTCTCTTTTacGaaCGACGCCaaTGgTGtTGGCgATAC +AtAtGGCTCgAgTaCgCatGTccccACAaCCCAaaAggGtataCAcAAAtATAGctGgACcggGGCatGAGttGtTcTGg +GcaaGcaatTCTCAgtGcCCaTCtGtGcccacTTcAgcCgCtaAGCAGgTAatCcacgAacCGgcgcCGTaatGcaGacg +gCGGcgctCcAAaTGaGCtAcTcaagcgGTGaTAagCTCtCCcaaacAAaaGTatatttaGTtacaaGAtTCaCgGtTTA +TCACACcgccCCTCCcCgGGTTtcTTCATGgGGTAttGaGTGTGACAGacccgtAGcGAAGaGGgATAtgTAtcaagCGA +GcgCtAGTCATccgtatTtACCAcAGAtaCGAcaTAcGTAGaCaAtccCCGAccCAtctTTgGCcCGAaaataCGaTCCT +AaCttCAtggAgCTTCcaTGGTAgGCcGatcgTCaATTGAcAAGCgcTGgCCCtCtGaCGCGCAatcCTTAcACTgaGTg +ttcATCCAaaacAgatCaccaCtCgTTCTGaaGTgTCGGaGaGtatGcaAaagTgcatAagGgcgtCTGgGGtcGCcAAA +acTaGGaTaTataGTatTAGTaGCTCacCgCGCtTGGgtcGTGTgCCTttGAGcGggAGtTTgaCGcgccTcATTagtga +TGaGcCgcagCcgcaCcaTccaaggaAtcCaAaaGaGtGGTTCcgcacTTCGACcCGcaGatgGgGgaTgTgcCGacgCC +CCAatTCccGGtAgcacTGCacaTataGGTtGCagATtgcccCAGcggcgtgATTtTTgCCGaaagTcTtcCagTTaTTg +caTTCCGcgGcatacAgCTggccgTcgGaCGAGgaatcaggcagGGgGaggGGgAtggGtAtctatctACTTGgAggCcG +cTaAGACctTCtcggacCatattgcaAGGAGTTaTaccTcAAccCAAGTCTCgacCCTCAGccTaGCggCattGaTCGcc +tggGgCACtAaactGCctGggtgaCGAgAtaCgAaGcAgGTAcaCgcaGgATgtcGCTAtGgGaAACAacCacCTgcAGg +tATAtaACGAGAatAgGagTATTatGATgCcgCCGCggTaTATaCActTaaTGcaacgtTggTgcTaaAagaATGgCTTT +cgATgCCTgtaCagGGtaATAAGCgTcATCCaaCAttggtGCGgtgTCTTaTggccTacCAaGaTcggcgTGcTTttcGG +cGCCacTgtGccgTggaTTACtcACCaagAtAtTAgCgGGATcATctcgCtGAccCCGcCGGaCGcTcTtTaAGCCtaAT +CtTcTcctCacTtGtgGCtTgAtTTcTAGAAGgGGgcGTgAGcGtGcAAcgTcCTtAAaaactTGtttCGcctGagTCgC +AacGCacTTAGacCtaacCTcACTgGccGtgGGtTcTgAgatcgcAcAAAaCCagGAaCAtgtAaagAtccgGaCTaTAT +gGCaAagCgcaatAgCtcTcTTTGAGcgTCACACgtGACggcggTGtCCCgcgcCGtGcGTcGtcGGtcGcaagGTTcCg +AaGCtaGgCgccagCgTctaGcaCtcTtaTtgggtAATTTGGcGGAcacGgaGCagacTTGGtgaaGTGCAcgTtAAGcG +cgggCgaGTtATtAtTCAttgtTTTtcaGTcAgtTtATccATtgaCCAAa diff --git a/t/data/test3.fa.fai b/t/data/test3.fa.fai new file mode 100644 index 0000000..4eae6e0 --- /dev/null +++ b/t/data/test3.fa.fai @@ -0,0 +1,5 @@ +1 5869 3 80 81 +2 8417 5949 80 81 +3 8653 14475 80 81 +4 1808 23240 80 81 +5 3410 25074 80 81 diff --git a/t/data/test_analysis1122.yaml b/t/data/test_analysis1122.yaml new file mode 100644 index 0000000..4d11193 --- /dev/null +++ b/t/data/test_analysis1122.yaml @@ -0,0 +1,43 @@ +name: zmp_ph1 +chunk_total: 3 +read1_length: 30 +read2_length: 54 +mismatch_threshold: 2 +bin_size: 100 +peak_buffer_width: 100 +hmm_sig_level: 0.001 +hmm_binary: bin/quince_chiphmmnew +r_binary: R +deseq_script: script/run_deseq.R +output_sig_level: 0.05 +ref_fasta: 
t/data/test12.fa +ensembl_species: danio_rerio +samples: + - + name: zmp_ph1_1m + description: ZMP phenotype 1.1 mutant + condition: mutant + group: 1 + tag: NNNNBGAGGC + bam_file: t/data/test1.bam + - + name: zmp_ph1_1s + description: ZMP phenotype 1.1 sibling + condition: sibling + group: 1 + tag: NNNNBAGAAG + bam_file: t/data/test1.bam + - + name: zmp_ph1_2m + description: ZMP phenotype 1.2 mutant + condition: mutant + group: 2 + tag: NNNNBCAGAG + bam_file: t/data/test2.bam + - + name: zmp_ph1_2s + description: ZMP phenotype 1.2 sibling + condition: sibling + group: 2 + tag: NNNNBGCACG + bam_file: t/data/test2.bam diff --git a/t/data/test_analysis12.yaml b/t/data/test_analysis12.yaml new file mode 100644 index 0000000..d0adcba --- /dev/null +++ b/t/data/test_analysis12.yaml @@ -0,0 +1,28 @@ +name: zmp_ph1 +chunk_total: 3 +read1_length: 30 +read2_length: 54 +mismatch_threshold: 2 +bin_size: 100 +peak_buffer_width: 100 +hmm_sig_level: 0.001 +hmm_binary: bin/quince_chiphmmnew +r_binary: R +deseq_script: script/run_deseq.R +output_sig_level: 0.05 +ref_fasta: t/data/test12.fa +samples: + - + name: zmp_ph1_1m + description: ZMP phenotype 1.1 mutant + condition: mutant + group: 1 + tag: NNNNBGAGGC + bam_file: t/data/test1.bam + - + name: zmp_ph1_1s + description: ZMP phenotype 1.1 sibling + condition: sibling + group: 1 + tag: NNNNBCAGAG + bam_file: t/data/test2.bam diff --git a/t/data/test_analysis13.yaml b/t/data/test_analysis13.yaml new file mode 100644 index 0000000..78028f6 --- /dev/null +++ b/t/data/test_analysis13.yaml @@ -0,0 +1,28 @@ +name: zmp_ph1 +chunk_total: 3 +read1_length: 30 +read2_length: 54 +mismatch_threshold: 2 +bin_size: 100 +peak_buffer_width: 100 +hmm_sig_level: 0.001 +hmm_binary: bin/quince_chiphmmnew +r_binary: R +deseq_script: script/run_deseq.R +output_sig_level: 0.05 +ref_fasta: t/data/test12.fa +samples: + - + name: zmp_ph1_1m + description: ZMP phenotype 1.1 mutant + condition: mutant + group: 1 + tag: NNNNBGAGGC + bam_file: t/data/test1.bam + - + name: zmp_ph1_1s + description: ZMP phenotype 1.1 sibling + condition: sibling + group: 1 + tag: NNNNBCGCAA + bam_file: t/data/test3.bam diff --git a/t/data/test_de.yaml b/t/data/test_de.yaml new file mode 100644 index 0000000..4a47940 --- /dev/null +++ b/t/data/test_de.yaml @@ -0,0 +1,75 @@ +- + name: count_tags + default_memory: 3000 +- + name: bin_reads + default_memory: 3000 +- + name: get_read_peaks + default_memory: 3000 +- + name: merge_read_peaks + default_memory: 50 + prerequisites: + - get_read_peaks +- + name: summarise_read_peaks + default_memory: 200 + prerequisites: + - merge_read_peaks +- + name: run_peak_hmm + default_memory: 300 + prerequisites: + - bin_reads + - summarise_read_peaks +- + name: join_hmm_bins + default_memory: 50 + prerequisites: + - run_peak_hmm +- + name: get_three_prime_ends + default_memory: 1000 + prerequisites: + - join_hmm_bins +- + name: merge_three_prime_ends + default_memory: 50 + prerequisites: + - get_three_prime_ends +- + name: filter_three_prime_ends + default_memory: 50 + prerequisites: + - merge_three_prime_ends +- + name: choose_three_prime_end + default_memory: 50 + prerequisites: + - filter_three_prime_ends +- + name: count_reads + default_memory: 300 + prerequisites: + - choose_three_prime_end +- + name: merge_read_counts + default_memory: 50 + prerequisites: + - count_reads +- + name: run_deseq + default_memory: 2000 + prerequisites: + - merge_read_counts +- + name: add_gene_annotation + default_memory: 3000 + prerequisites: + - run_deseq +- + name: dump_as_table + 
default_memory: 3000 + prerequisites: + - add_gene_annotation diff --git a/t/gene.t b/t/gene.t new file mode 100644 index 0000000..57f1d56 --- /dev/null +++ b/t/gene.t @@ -0,0 +1,119 @@ +use Test::More; +use Test::Exception; +use Test::Warn; +use Test::DatabaseRow; +use Test::MockObject; +use Carp; + +plan tests => 54; + +use DETCT::Gene; + +my $gene = DETCT::Gene->new( + { + genebuild_version => 'e61', + stable_id => 'ENSDARG00000095747', + biotype => 'protein_coding', + seq_name => '5', + start => 40352744, + end => 40354399, + strand => 1, + } +); + +isa_ok( $gene, 'DETCT::Gene' ); + +# Test genebuild version attribute +is( $gene->genebuild_version, 'e61', 'Get genebuild version' ); +is( $gene->set_genebuild_version('e62'), undef, 'Set genebuild version' ); +is( $gene->genebuild_version, 'e62', 'Get new genebuild version' ); +throws_ok { $gene->set_genebuild_version() } +qr/No genebuild version specified/ms, 'No genebuild version'; +throws_ok { $gene->set_genebuild_version('#invalid#') } +qr/Invalid genebuild version/ms, 'Invalid genebuild version'; + +# Test stable id attribute +is( $gene->stable_id, 'ENSDARG00000095747', 'Get stable id' ); +is( $gene->set_stable_id('ENSDARG00000024771'), undef, 'Set stable id' ); +is( $gene->stable_id, 'ENSDARG00000024771', 'Get new stable id' ); +throws_ok { $gene->set_stable_id() } qr/No stable id specified/ms, + 'No stable id'; +throws_ok { $gene->set_stable_id('#invalid#') } qr/Invalid stable id/ms, + 'Invalid stable id'; + +# Test name attribute +is( $gene->name, undef, 'Get name' ); +is( $gene->set_name('cxc64'), undef, 'Set name' ); +is( $gene->name, 'cxc64', 'Get new name' ); +is( $gene->set_name(), undef, 'Set undef name' ); +is( $gene->name, undef, 'Get undef name' ); +my $long_name = 'X' x ( $DETCT::Gene::MAX_NAME_LENGTH + 1 ); +throws_ok { $gene->set_name('') } qr/Name is empty/ms, 'Empty name'; +throws_ok { $gene->set_name($long_name) } qr/longer than \d+ characters/ms, + 'Invalid name'; + +# Test description attribute +is( $gene->description, undef, 'Get description' ); +is( $gene->set_description('CXC chemokine 64'), undef, 'Set description' ); +is( $gene->description, 'CXC chemokine 64', 'Get new description' ); +is( $gene->set_description(), undef, 'Set undef description' ); +is( $gene->description, undef, 'Get undef description' ); + +# Test biotype attribute +is( $gene->biotype, 'protein_coding', 'Get biotype' ); +is( $gene->set_biotype('nonsense_mediated_decay'), undef, 'Set biotype' ); +is( $gene->biotype, 'nonsense_mediated_decay', 'Get new biotype' ); +throws_ok { $gene->set_biotype() } qr/No biotype specified/ms, 'No biotype'; +throws_ok { $gene->set_biotype('#invalid#') } qr/Invalid biotype/ms, + 'Invalid biotype'; + +# Test sequence name attribute +is( $gene->seq_name, '5', 'Get sequence name' ); +is( $gene->set_seq_name('6'), undef, 'Set sequence name' ); +is( $gene->seq_name, '6', 'Get new sequence name' ); +throws_ok { $gene->set_seq_name() } qr/No sequence name specified/ms, + 'No sequence name'; +throws_ok { $gene->set_seq_name('#invalid#') } qr/Invalid sequence name/ms, + 'Invalid sequence name'; + +# Test start attribute +is( $gene->start, 40352744, 'Get start' ); +is( $gene->set_start(30352744), undef, 'Set start' ); +is( $gene->start, 30352744, 'Get new start' ); +throws_ok { $gene->set_start() } qr/No start specified/ms, 'No start'; +throws_ok { $gene->set_start(-1) } qr/Invalid start/ms, 'Invalid start'; + +# Test end attribute +is( $gene->end, 40354399, 'Get end' ); +is( $gene->set_end(30354399), undef, 'Set end' 
); +is( $gene->end, 30354399, 'Get new end' ); +throws_ok { $gene->set_end() } qr/No end specified/ms, 'No end'; +throws_ok { $gene->set_end(-2) } qr/Invalid end/ms, 'Invalid end'; + +# Test strand attribute +is( $gene->strand, 1, 'Get strand' ); +is( $gene->set_strand(-1), undef, 'Set strand' ); +is( $gene->strand, -1, 'Get new strand' ); +throws_ok { $gene->set_strand() } qr/No strand specified/ms, 'No strand'; +throws_ok { $gene->set_strand(0) } qr/Invalid strand/ms, 'Invalid strand'; + +# Mock transcript objects +my $transcript1 = Test::MockObject->new(); +$transcript1->set_isa('DETCT::Transcript'); +my $transcript2 = Test::MockObject->new(); +$transcript2->set_isa('DETCT::Transcript'); + +# Test adding and retrieving transcripts +my $transcripts; +$transcripts = $gene->get_all_transcripts(); +is( scalar @{$transcripts}, 0, 'No transcripts' ); +is( $gene->add_transcript($transcript1), undef, 'Add transcript' ); +$transcripts = $gene->get_all_transcripts(); +is( scalar @{$transcripts}, 1, 'Get one transcript' ); +$gene->add_transcript($transcript2); +is( scalar @{$transcripts}, 2, 'Get two transcripts' ); +throws_ok { $gene->add_transcript() } qr/No transcript specified/ms, + 'No transcript specified'; +throws_ok { $gene->add_transcript('invalid') } qr/Class of transcript/ms, + 'Invalid transcript'; + diff --git a/t/genefinder.t b/t/genefinder.t new file mode 100644 index 0000000..ccd7a86 --- /dev/null +++ b/t/genefinder.t @@ -0,0 +1,311 @@ +use Test::More; +use Test::Exception; +use Test::Warn; +use Test::DatabaseRow; +use Test::MockObject; +use Carp; + +plan tests => 93; + +use DETCT::GeneFinder; + +# Mock genes +my @genes; +my @transcript_three_prime_ends = + ( [ 100, 1 ], [ 200, 1 ], [ 300, -1 ], [ 400, -1 ], ); +foreach my $transcript_three_prime_end (@transcript_three_prime_ends) { + my ( $pos, $strand ) = @{$transcript_three_prime_end}; + + # Construct start and end so $pos is always 3' end + my $start = $strand == 1 ? $pos - 50 : $pos; + my $end = $strand == 1 ? $pos : $pos + 50; + + # Create genes named by 3' end position + my $gene = Test::MockObject->new(); + $gene->set_always( 'stable_id', 'ENSDARG00000095747' ); + $gene->set_always( 'external_name', q{g} . $pos . q{:} . $strand ); + $gene->set_always( 'description', undef ); + $gene->set_always( 'biotype', 'protein_coding' ); + $gene->set_always( 'seq_region_start', $start ); + $gene->set_always( 'seq_region_end', $end ); + $gene->set_always( 'seq_region_strand', $strand ); + my $transcript = Test::MockObject->new(); + $transcript->set_always( 'stable_id', 'ENSDART00000133571' ); + $transcript->set_always( 'external_name', q{t} . $pos . q{:} . 
$strand ); + $transcript->set_always( 'description', undef ); + $transcript->set_always( 'biotype', 'protein_coding' ); + $transcript->set_always( 'seq_region_start', $start ); + $transcript->set_always( 'seq_region_end', $end ); + $transcript->set_always( 'seq_region_strand', $strand ); + my $transcript_far = Test::MockObject->new(); + $transcript_far->set_always( 'stable_id', 'ENSDART00000133572' ); + $transcript_far->set_always( 'external_name', 'cxc64-001' ); + $transcript_far->set_always( 'description', undef ); + $transcript_far->set_always( 'biotype', 'protein_coding' ); + $transcript_far->set_always( 'seq_region_start', 100_000 ); + $transcript_far->set_always( 'seq_region_end', 100_100 ); + $transcript_far->set_always( 'seq_region_strand', $strand ); + $gene->set_always( 'get_all_Transcripts', + [ $transcript, $transcript_far ] ); + push @genes, $gene; +} + +# Mock slice +my $slice = Test::MockObject->new(); +$slice->set_always( 'get_all_Genes', \@genes ); + +# Mock slice adaptor +my $slice_adaptor = Test::MockObject->new(); +$slice_adaptor->set_isa('Bio::EnsEMBL::DBSQL::SliceAdaptor'); +$slice_adaptor->set_always( 'fetch_by_region', $slice ); + +my $gene_finder = + DETCT::GeneFinder->new( { slice_adaptor => $slice_adaptor, } ); + +isa_ok( $gene_finder, 'DETCT::GeneFinder' ); + +# Test Ensembl slice adaptor attribute +isa_ok( $gene_finder->slice_adaptor, 'Bio::EnsEMBL::DBSQL::SliceAdaptor' ); +throws_ok { $gene_finder->set_slice_adaptor() } +qr/No Ensembl slice adaptor specified/ms, 'No Ensembl slice adaptor'; +throws_ok { $gene_finder->set_slice_adaptor('invalid') } +qr/Class of Ensembl slice adaptor/ms, 'Invalid Ensembl slice adaptor'; + +my $genes; +my $transcripts; +my $distance; +my $nearest_end_pos; + +# Near to one gene on forward strand +( $genes, $distance, $nearest_end_pos ) = + $gene_finder->get_nearest_genes( '1', 110, 1 ); +is( $genes->[0]->name, 'g100:1', + q{Gene with 3' end at 100 bp on forward strand} ); +is( $distance, 10, q{3' end is 10 bp downstream} ); +is( $nearest_end_pos, 100, q{3' end at 100 bp} ); +( $genes, $distance, $nearest_end_pos ) = + $gene_finder->get_nearest_genes( '1', 90, 1 ); +is( $genes->[0]->name, 'g100:1', + q{Gene with 3' end at 100 bp on forward strand} ); +is( $distance, -10, q{3' end is 10 bp upstream} ); +is( $nearest_end_pos, 100, q{3' end at 100 bp} ); + +# Near to one gene on reverse strand +( $genes, $distance, $nearest_end_pos ) = + $gene_finder->get_nearest_genes( '1', 290, -1 ); +is( $genes->[0]->name, 'g300:-1', + q{Gene with 3' end at 300 bp on reverse strand} ); +is( $distance, 10, q{3' end is 10 bp downstream} ); +is( $nearest_end_pos, 300, q{3' end at 300 bp} ); +( $genes, $distance, $nearest_end_pos ) = + $gene_finder->get_nearest_genes( '1', 310, -1 ); +is( $genes->[0]->name, 'g300:-1', + q{Gene with 3' end at 300 bp on reverse strand} ); +is( $distance, -10, q{3' end is 10 bp upstream} ); +is( $nearest_end_pos, 300, q{3' end at 300 bp} ); + +# Between two genes on forward strand +( $genes, $distance, $nearest_end_pos ) = + $gene_finder->get_nearest_genes( '1', 150, 1 ); +is( $genes->[0]->name, 'g100:1', + q{Gene with 3' end at 100 bp on forward strand} ); +is( $distance, 50, q{3' end is 50 bp upstream} ); +is( $nearest_end_pos, 100, q{3' end at 100 bp} ); + +# Between two genes on reverse strand +( $genes, $distance, $nearest_end_pos ) = + $gene_finder->get_nearest_genes( '1', 350, -1 ); +is( $genes->[0]->name, 'g400:-1', + q{Gene with 3' end at 400 bp on reverse strand} ); +is( $distance, 50, q{3' end is 50 bp upstream} ); +is( 
$nearest_end_pos, 400, q{3' end at 400 bp} ); + +# Near to one transcript on forward strand +( $transcripts, $distance, $nearest_end_pos ) = + $gene_finder->get_nearest_transcripts( '1', 110, 1 ); +is( $transcripts->[0]->name, + 't100:1', q{Transcript with 3' end at 100 bp on forward strand} ); +is( $distance, 10, q{3' end is 10 bp downstream} ); +is( $nearest_end_pos, 100, q{3' end at 100 bp} ); +( $transcripts, $distance, $nearest_end_pos ) = + $gene_finder->get_nearest_transcripts( '1', 90, 1 ); +is( $transcripts->[0]->name, + 't100:1', q{Transcript with 3' end at 100 bp on forward strand} ); +is( $distance, -10, q{3' end is 10 bp upstream} ); +is( $nearest_end_pos, 100, q{3' end at 100 bp} ); + +# Near to one transcript on reverse strand +( $transcripts, $distance, $nearest_end_pos ) = + $gene_finder->get_nearest_transcripts( '1', 290, -1 ); +is( $transcripts->[0]->name, + 't300:-1', q{Transcript with 3' end at 300 bp on reverse strand} ); +is( $distance, 10, q{3' end is 10 bp downstream} ); +is( $nearest_end_pos, 300, q{3' end at 300 bp} ); +( $transcripts, $distance, $nearest_end_pos ) = + $gene_finder->get_nearest_transcripts( '1', 310, -1 ); +is( $transcripts->[0]->name, + 't300:-1', q{Transcript with 3' end at 300 bp on reverse strand} ); +is( $distance, -10, q{3' end is 10 bp upstream} ); +is( $nearest_end_pos, 300, q{3' end at 300 bp} ); + +# Between two transcripts on forward strand +( $transcripts, $distance, $nearest_end_pos ) = + $gene_finder->get_nearest_transcripts( '1', 150, 1 ); +is( $transcripts->[0]->name, + 't100:1', q{Transcript with 3' end at 100 bp on forward strand} ); +is( $distance, 50, q{3' end is 50 bp upstream} ); +is( $nearest_end_pos, 100, q{3' end at 100 bp} ); + +# Between two transcripts on reverse strand +( $transcripts, $distance, $nearest_end_pos ) = + $gene_finder->get_nearest_transcripts( '1', 350, -1 ); +is( $transcripts->[0]->name, + 't400:-1', q{Transcript with 3' end at 400 bp on reverse strand} ); +is( $distance, 50, q{3' end is 50 bp upstream} ); +is( $nearest_end_pos, 400, q{3' end at 400 bp} ); + +# Check adding gene annotation required parameters +throws_ok { $gene_finder->add_gene_annotation() } qr/No regions specified/ms, + 'No regions'; + +my $regions; + +# Adding gene annotation +$regions = [ + [ '1', 1, 1000, 10, -10, '1', 110, 1, 10, [], undef, undef, [], [] ], + [ '1', 1, 1000, 10, -10, '1', 290, -1, 10, [], undef, undef, [], [] ], + [ '1', 1, 1000, 10, -10, '1', 100, 1, 10, [], undef, undef, [], [] ], + [ '1', 1, 1000, 10, -10, '1', 300, -1, 10, [], undef, undef, [], [] ], +]; +my $annotated_regions = $gene_finder->add_gene_annotation($regions); +my ($gv) = keys %{ $annotated_regions->[0]->[-1] }; # Genebuild version varies +is( scalar keys %{ $annotated_regions->[0]->[-1] }, 1, '1 genebuild' ); +is( scalar @{ $annotated_regions->[0]->[-1]->{$gv} }, 1, '1 gene' ); +is( $annotated_regions->[0]->[-1]->{$gv}->[0]->[0], + 'ENSDARG00000095747', 'Stable id' ); +is( $annotated_regions->[0]->[-1]->{$gv}->[0]->[1], + 'g100:1', q{3' end as name} ); +is( $annotated_regions->[0]->[-1]->{$gv}->[0]->[2], undef, 'Description' ); +is( $annotated_regions->[0]->[-1]->{$gv}->[0]->[3], + 'protein_coding', 'Biotype' ); +is( $annotated_regions->[0]->[-1]->{$gv}->[0]->[4], 10, 'Distance downstream' ); +is( scalar @{ $annotated_regions->[0]->[-1]->{$gv}->[0]->[5] }, + 1, '1 transcript' ); +is( $annotated_regions->[0]->[-1]->{$gv}->[0]->[5]->[0]->[0], + 'ENSDART00000133571', 'Transcript stable id' ); +is( 
$annotated_regions->[0]->[-1]->{$gv}->[0]->[5]->[0]->[1], + 'protein_coding', 'Transcript biotype' ); +is( scalar keys %{ $annotated_regions->[1]->[-1] }, 1, '1 genebuild' ); +is( scalar @{ $annotated_regions->[1]->[-1]->{$gv} }, 1, '1 gene' ); +is( $annotated_regions->[1]->[-1]->{$gv}->[0]->[0], + 'ENSDARG00000095747', 'Stable id' ); +is( $annotated_regions->[1]->[-1]->{$gv}->[0]->[1], + 'g300:-1', q{3' end as name} ); +is( $annotated_regions->[1]->[-1]->{$gv}->[0]->[2], undef, 'Description' ); +is( $annotated_regions->[1]->[-1]->{$gv}->[0]->[3], + 'protein_coding', 'Biotype' ); +is( $annotated_regions->[1]->[-1]->{$gv}->[0]->[4], 10, 'Distance downstream' ); +is( scalar @{ $annotated_regions->[1]->[-1]->{$gv}->[0]->[5] }, + 1, '1 transcript' ); +is( $annotated_regions->[1]->[-1]->{$gv}->[0]->[5]->[0]->[0], + 'ENSDART00000133571', 'Transcript stable id' ); +is( $annotated_regions->[1]->[-1]->{$gv}->[0]->[5]->[0]->[1], + 'protein_coding', 'Transcript biotype' ); +is( scalar keys %{ $annotated_regions->[2]->[-1] }, 1, '1 genebuild' ); +is( scalar @{ $annotated_regions->[2]->[-1]->{$gv} }, 1, '1 gene' ); +is( $annotated_regions->[2]->[-1]->{$gv}->[0]->[0], + 'ENSDARG00000095747', 'Stable id' ); +is( $annotated_regions->[2]->[-1]->{$gv}->[0]->[1], + 'g100:1', q{3' end as name} ); +is( $annotated_regions->[2]->[-1]->{$gv}->[0]->[2], undef, 'Description' ); +is( $annotated_regions->[2]->[-1]->{$gv}->[0]->[3], + 'protein_coding', 'Biotype' ); +is( $annotated_regions->[2]->[-1]->{$gv}->[0]->[4], 0, 'Distance downstream' ); +is( scalar @{ $annotated_regions->[2]->[-1]->{$gv}->[0]->[5] }, + 1, '1 transcript' ); +is( $annotated_regions->[2]->[-1]->{$gv}->[0]->[5]->[0]->[0], + 'ENSDART00000133571', 'Transcript stable id' ); +is( $annotated_regions->[2]->[-1]->{$gv}->[0]->[5]->[0]->[1], + 'protein_coding', 'Transcript biotype' ); +is( scalar keys %{ $annotated_regions->[3]->[-1] }, 1, '1 genebuild' ); +is( scalar @{ $annotated_regions->[3]->[-1]->{$gv} }, 1, '1 gene' ); +is( $annotated_regions->[3]->[-1]->{$gv}->[0]->[0], + 'ENSDARG00000095747', 'Stable id' ); +is( $annotated_regions->[3]->[-1]->{$gv}->[0]->[1], + 'g300:-1', q{3' end as name} ); +is( $annotated_regions->[3]->[-1]->{$gv}->[0]->[2], undef, 'Description' ); +is( $annotated_regions->[3]->[-1]->{$gv}->[0]->[3], + 'protein_coding', 'Biotype' ); +is( $annotated_regions->[3]->[-1]->{$gv}->[0]->[4], 0, 'Distance downstream' ); +is( scalar @{ $annotated_regions->[3]->[-1]->{$gv}->[0]->[5] }, + 1, '1 transcript' ); +is( $annotated_regions->[3]->[-1]->{$gv}->[0]->[5]->[0]->[0], + 'ENSDART00000133571', 'Transcript stable id' ); +is( $annotated_regions->[3]->[-1]->{$gv}->[0]->[5]->[0]->[1], + 'protein_coding', 'Transcript biotype' ); + +# Mock genes all on one strand +@genes = (); +@transcript_three_prime_ends = + ( [ 100, 1 ], [ 200, 1 ], [ 300, 1 ], [ 400, 1 ], ); +foreach my $transcript_three_prime_end (@transcript_three_prime_ends) { + my ( $pos, $strand ) = @{$transcript_three_prime_end}; + + # Create genes named by 3' end position + my $gene = Test::MockObject->new(); + $gene->set_always( 'stable_id', 'ENSDARG00000095747' ); + $gene->set_always( 'external_name', q{g} . $pos . q{:} . 
$strand ); + $gene->set_always( 'description', undef ); + $gene->set_always( 'biotype', 'protein_coding' ); + $gene->set_always( 'seq_region_start', 1 ); + $gene->set_always( 'seq_region_end', $pos ); + $gene->set_always( 'seq_region_strand', $strand ); + my $transcript = Test::MockObject->new(); + $transcript->set_always( 'stable_id', 'ENSDART00000133571' ); + $transcript->set_always( 'external_name', 'cxc64-001' ); + $transcript->set_always( 'description', undef ); + $transcript->set_always( 'biotype', 'protein_coding' ); + $transcript->set_always( 'seq_region_start', $pos - 50 ); + $transcript->set_always( 'seq_region_end', $pos ); + $transcript->set_always( 'seq_region_strand', $strand ); + $gene->set_always( 'get_all_Transcripts', [$transcript] ); + push @genes, $gene; +} + +# Mock slice +$slice = Test::MockObject->new(); +$slice->set_always( 'get_all_Genes', \@genes ); + +# Mock slice adaptor +$slice_adaptor = Test::MockObject->new(); +$slice_adaptor->set_isa('Bio::EnsEMBL::DBSQL::SliceAdaptor'); +$slice_adaptor->set_always( 'fetch_by_region', $slice ); + +$gene_finder = DETCT::GeneFinder->new( { slice_adaptor => $slice_adaptor, } ); + +isa_ok( $gene_finder, 'DETCT::GeneFinder' ); + +# Adding gene annotation with genes only on one strand +$regions = [ + [ '1', 1, 1000, 10, -10, '1', 110, 1, 10, [], undef, undef, [], [] ], + [ '1', 1, 1000, 10, -10, '1', 290, -1, 10, [], undef, undef, [], [] ], +]; +my $annotated_regions = $gene_finder->add_gene_annotation($regions); +my ($gv) = keys %{ $annotated_regions->[0]->[-1] }; # Genebuild version varies +is( scalar keys %{ $annotated_regions->[0]->[-1] }, 1, '1 genebuild' ); +is( scalar @{ $annotated_regions->[0]->[-1]->{$gv} }, 1, '1 gene' ); +is( $annotated_regions->[0]->[-1]->{$gv}->[0]->[0], + 'ENSDARG00000095747', 'Stable id' ); +is( $annotated_regions->[0]->[-1]->{$gv}->[0]->[1], + 'g100:1', q{3' end as name} ); +is( $annotated_regions->[0]->[-1]->{$gv}->[0]->[2], undef, 'Description' ); +is( $annotated_regions->[0]->[-1]->{$gv}->[0]->[3], + 'protein_coding', 'Biotype' ); +is( $annotated_regions->[0]->[-1]->{$gv}->[0]->[4], 10, 'Distance downstream' ); +is( scalar @{ $annotated_regions->[0]->[-1]->{$gv}->[0]->[5] }, + 1, '1 transcript' ); +is( $annotated_regions->[0]->[-1]->{$gv}->[0]->[5]->[0]->[0], + 'ENSDART00000133571', 'Transcript stable id' ); +is( $annotated_regions->[0]->[-1]->{$gv}->[0]->[5]->[0]->[1], + 'protein_coding', 'Transcript biotype' ); +is( scalar keys %{ $annotated_regions->[1]->[-1] }, + 0, 'No genes on reverse strand' ); diff --git a/t/misc-bam.t b/t/misc-bam.t new file mode 100644 index 0000000..86f41bf --- /dev/null +++ b/t/misc-bam.t @@ -0,0 +1,1652 @@ +use Test::More; +use Test::Exception; +use Test::Warn; +use Test::DatabaseRow; +use Test::MockObject; +use Carp; + +plan tests => 323; + +use DETCT::Misc::BAM qw( + get_reference_sequence_lengths + get_sequence + count_tags + bin_reads + get_read_peaks + get_three_prime_ends + merge_three_prime_ends + filter_three_prime_ends + choose_three_prime_end + count_reads + merge_read_counts +); + +=for comment + +Test random BAM files can be regenerated using: + +perl script/make_test_sam.pl --seed 10 --seq_region_count 5 \ +--seq_region_max_length 10_000 --read_pair_count 100 \ +--read_tags NNNNBGAGGC NNNNBAGAAG | samtools view -bS - | samtools sort - test1 +perl script/make_test_sam.pl --seed 10 --seq_region_count 5 \ +--seq_region_max_length 10_000 --read_pair_count 100 \ +--read_tags NNNNBCAGAG NNNNBGCACG | samtools view -bS - | samtools sort - test2 +perl 
script/make_test_sam.pl --seed 20 --seq_region_count 5 \ +--seq_region_max_length 10_000 --read_pair_count 100 \ +--read_tags NNNNBCGCAA NNNNBCAAGA | samtools view -bS - | samtools sort - test3 +ls *.bam | xargs -n1 samtools index +mv test* t/data/ + +Some numbers in tests below will then need updating. Code to generate numbers +(using independent methods) is given before each test. + +Test random FASTA files can be regenerated using: + +perl script/make_test_fasta.pl --seed 10 --seq_region_count 5 \ +--seq_region_max_length 10_000 > test12.fa +perl script/make_test_fasta.pl --seed 20 --seq_region_count 5 \ +--seq_region_max_length 10_000 > test3.fa +ls *.fa | xargs -n1 samtools faidx +mv test* t/data/ + +=cut + +# Check reference sequence length returned by test BAM file +throws_ok { get_reference_sequence_lengths() } qr/No BAM file specified/ms, + 'No BAM file'; +my %bam_length = get_reference_sequence_lengths('t/data/test1.bam'); +is( $bam_length{1}, 8789, 'Chr 1 length' ); +is( $bam_length{2}, 7958, 'Chr 2 length' ); +is( $bam_length{3}, 4808, 'Chr 3 length' ); + +# Check getting sequence from test FASTA file +# First 10 bp of chromosome 1 should be CCAGGCGCGG according to: + +=for comment +head -2 t/data/test12.fa +=cut + +throws_ok { + get_sequence( + { + seq_name => '1', + start => 1, + end => 10, + strand => 1, + } + ); +} +qr/No FASTA index or FASTA file specified/ms, 'No FASTA index or file'; +throws_ok { + get_sequence( + { + ref_fasta => 't/data/test12.fa', + start => 1, + end => 10, + strand => 1, + } + ); +} +qr/No sequence name specified/ms, 'No sequence name'; +throws_ok { + get_sequence( + { + ref_fasta => 't/data/test12.fa', + seq_name => '1', + end => 10, + strand => 1, + } + ); +} +qr/No sequence start specified/ms, 'No sequence start'; +throws_ok { + get_sequence( + { + ref_fasta => 't/data/test12.fa', + seq_name => '1', + start => 1, + strand => 1, + } + ); +} +qr/No sequence end specified/ms, 'No sequence end'; +throws_ok { + get_sequence( + { + ref_fasta => 't/data/test12.fa', + seq_name => '1', + start => 1, + end => 10, + } + ); +} +qr/No sequence strand specified/ms, 'No sequence strand'; +my $seq; +$seq = get_sequence( + { + ref_fasta => 't/data/test12.fa', + seq_name => '1', + start => 1, + end => 10, + strand => 1, + } +); +is( length $seq, 10, 'Subsequence length' ); +is( $seq, 'CCAGGCGCGG', 'Subsequence' ); +$seq = get_sequence( + { + ref_fasta => 't/data/test12.fa', + seq_name => '1', + start => 1, + end => 10, + strand => -1, + } +); +is( length $seq, 10, 'Reverse complement subsequence length' ); +is( $seq, 'CCGCGCCTGG', 'Reverse complement subsequence' ); + +# Check counting tags required parameters +throws_ok { + count_tags( + { + mismatch_threshold => 2, + seq_name => '1', + tags => ['NNNNBGAGGC'], + } + ); +} +qr/No BAM file specified/ms, 'No BAM file'; +throws_ok { + count_tags( + { + bam_file => 't/data/test1.bam', + seq_name => '1', + tags => ['NNNNBGAGGC'], + } + ); +} +qr/No mismatch threshold specified/ms, 'No mismatch threshold'; +throws_ok { + count_tags( + { + bam_file => 't/data/test1.bam', + mismatch_threshold => 2, + tags => ['NNNNBGAGGC'], + } + ); +} +qr/No sequence name specified/ms, 'No sequence name'; +throws_ok { + count_tags( + { + bam_file => 't/data/test1.bam', + mismatch_threshold => 2, + seq_name => '1', + } + ); +} +qr/No tags specified/ms, 'No tags'; + +my $count; + +# Check tag counts returned by chromosome 1 of test BAM file +# Should be 50 random tags according to: + +=for comment +samtools view -f 128 -F 1028 
t/data/test1.bam 1 | awk '{ print $1 }' \ +| sed -e 's/.*#//' | grep GAGGC$ | sort -u | wc -l +=cut + +$count = count_tags( + { + bam_file => 't/data/test1.bam', + mismatch_threshold => 100, + seq_name => '1', + tags => ['NNNNBGAGGC'], + } +); +is( scalar keys %{$count}, 1, '1 tag' ); +is( scalar keys %{ $count->{NNNNBGAGGC} }, 50, '50 random tags' ); + +# Check tag counts returned in 1000 bp onwards of chromosome 1 of test BAM file +# Should be 45 random tags according to: + +=for comment +samtools view -f 128 -F 1028 t/data/test1.bam 1:1000 | awk '{ print $1 }' \ +| sed -e 's/.*#//' | grep GAGGC$ | sort -u | wc -l +=cut + +$count = count_tags( + { + bam_file => 't/data/test1.bam', + mismatch_threshold => 100, + seq_name => '1', + start => 1000, + tags => ['NNNNBGAGGC'], + } +); +is( scalar keys %{$count}, 1, '1 tag' ); +is( scalar keys %{ $count->{NNNNBGAGGC} }, 45, '45 random tags' ); + +# Check tag counts returned in first 1000 bp of chromosome 1 of test BAM file +# Should be 6 random tags according to: + +=for comment +samtools view -f 128 -F 1028 t/data/test1.bam 1:1-1000 | awk '{ print $1 }' \ +| sed -e 's/.*#//' | grep GAGGC$ | sort -u | wc -l +=cut + +$count = count_tags( + { + bam_file => 't/data/test1.bam', + mismatch_threshold => 100, + seq_name => '1', + start => 1, + end => 1000, + tags => ['NNNNBGAGGC'], + } +); +is( scalar keys %{$count}, 1, '1 tag' ); +is( scalar keys %{ $count->{NNNNBGAGGC} }, 6, '6 random tags' ); + +# Check tag counts returned with low mismatch threshold +# Should be 13 random tags according to: + +=for comment +samtools view -f 128 -F 1028 t/data/test1.bam 1 | grep NM:i:0 \ +| awk '{ if ($6 == "54M") print $1 }' \ +| sed -e 's/.*#//' | grep GAGGC$ | sort -u | wc -l +=cut + +$count = count_tags( + { + bam_file => 't/data/test1.bam', + mismatch_threshold => 0, + seq_name => '1', + tags => ['NNNNBGAGGC'], + } +); +is( scalar keys %{$count}, 1, '1 tag' ); +is( scalar keys %{ $count->{NNNNBGAGGC} }, 13, '13 random tags' ); + +# Check binning reads required parameters +throws_ok { + bin_reads( + { + mismatch_threshold => 2, + bin_size => 100, + seq_name => '1', + tags => ['NNNNBGAGGC'], + } + ); +} +qr/No BAM file specified/ms, 'No BAM file'; +throws_ok { + bin_reads( + { + bam_file => 't/data/test1.bam', + bin_size => 100, + seq_name => '1', + tags => ['NNNNBGAGGC'], + } + ); +} +qr/No mismatch threshold specified/ms, 'No mismatch threshold'; +throws_ok { + bin_reads( + { + bam_file => 't/data/test1.bam', + mismatch_threshold => 2, + seq_name => '1', + tags => ['NNNNBGAGGC'], + } + ); +} +qr/No bin size specified/ms, 'No bin size'; +throws_ok { + bin_reads( + { + bam_file => 't/data/test1.bam', + mismatch_threshold => 2, + bin_size => 100, + tags => ['NNNNBGAGGC'], + } + ); +} +qr/No sequence name specified/ms, 'No sequence name'; +throws_ok { + bin_reads( + { + bam_file => 't/data/test1.bam', + mismatch_threshold => 2, + bin_size => 100, + seq_name => '1', + } + ); +} +qr/No tags specified/ms, 'No tags'; + +# Check read bins returned by test BAM file +# Should be 35 bins according to: + +=for comment +(samtools view -f 16 -F 1028 t/data/test1.bam 2 | grep 54M | grep NM:i:0 \ +| awk '{ print ($4) / 100 "\t" ($4 + 53 - 1) / 100 }'; \ +samtools view -f 32 -F 1028 t/data/test1.bam 2 | grep 54M | grep NM:i:0 \ +| awk '{ print ($4) / 100 "\t" ($4 + 53 - 1) / 100 }') \ +| sed -e 's/\.[0-9]*//g' \ +| awk '{ if ($1 == $2) print $1; else print $1 "\n" $2 }' \ +| sort | uniq -c | wc -l +=cut + +$count = bin_reads( + { + bam_file => 't/data/test1.bam', + 
mismatch_threshold => 0, + bin_size => 100, + seq_name => '2', + tags => [ 'NNNNBGAGGC', 'NNNNBAGAAG' ], + } +); +is( scalar keys %{$count}, 1, '1 sequence' ); +is( scalar keys %{ $count->{'2'} }, 35, '35 bins' ); + +# Check read bins returned with non-existent tag +$count = bin_reads( + { + bam_file => 't/data/test1.bam', + mismatch_threshold => 0, + bin_size => 100, + seq_name => '1', + tags => ['NNNNTTTTTT'], + } +); +is( scalar keys %{$count}, 1, '1 sequence' ); +is( scalar keys %{ $count->{'1'} }, 0, '0 bins' ); + +# Check getting read peaks required parameters +throws_ok { + get_read_peaks( + { + mismatch_threshold => 0, + peak_buffer_width => 100, + seq_name => '1', + tags => [ 'NNNNBGAGGC', 'NNNNBAGAAG' ], + } + ); +} +qr/No BAM file specified/ms, 'No BAM file'; +throws_ok { + get_read_peaks( + { + bam_file => 't/data/test1.bam', + peak_buffer_width => 100, + seq_name => '1', + tags => [ 'NNNNBGAGGC', 'NNNNBAGAAG' ], + } + ); +} +qr/No mismatch threshold specified/ms, 'No mismatch threshold'; +throws_ok { + get_read_peaks( + { + bam_file => 't/data/test1.bam', + mismatch_threshold => 0, + seq_name => '1', + tags => [ 'NNNNBGAGGC', 'NNNNBAGAAG' ], + } + ); +} +qr/No peak buffer width specified/ms, 'No peak buffer width'; +throws_ok { + get_read_peaks( + { + bam_file => 't/data/test1.bam', + mismatch_threshold => 0, + peak_buffer_width => 100, + tags => [ 'NNNNBGAGGC', 'NNNNBAGAAG' ], + } + ); +} +qr/No sequence name specified/ms, 'No sequence name'; +throws_ok { + get_read_peaks( + { + bam_file => 't/data/test1.bam', + mismatch_threshold => 0, + peak_buffer_width => 100, + seq_name => '1', + } + ); +} +qr/No tags specified/ms, 'No tags'; + +my $peaks; + +# Check read peaks returned by test BAM file +# First peak should be 262 - 350 (2 reads) according to: + +=for comment +samtools view -f 128 -F 1028 t/data/test1.bam 2 | grep 54M | grep NM:i:0 \ +| awk '{ print $4 "\t" $4 + 53 }' | head -4 +=cut + +# Last peak should be 7399 - 7452 (1 read) according to: + +=for comment +samtools view -f 128 -F 1028 t/data/test1.bam 2 | grep 54M | grep NM:i:0 \ +| awk '{ print $4 "\t" $4 + 53 }' | tail -4 +=cut + +$peaks = get_read_peaks( + { + bam_file => 't/data/test1.bam', + mismatch_threshold => 0, + peak_buffer_width => 100, + seq_name => '2', + tags => [ 'NNNNBGAGGC', 'NNNNBAGAAG' ], + } +); +is( scalar keys %{$peaks}, 1, '1 sequence' ); +is( $peaks->{'2'}->[0]->[0], 262, 'Start of first peak' ); +is( $peaks->{'2'}->[0]->[1], 350, 'End of first peak' ); +is( $peaks->{'2'}->[0]->[2], 2, 'First peak read count' ); +is( $peaks->{'2'}->[-1]->[0], 7399, 'Start of last peak' ); +is( $peaks->{'2'}->[-1]->[1], 7452, 'End of last peak' ); +is( $peaks->{'2'}->[-1]->[2], 1, 'Last peak read count' ); + +# Check read peaks returned by test BAM file +# First peak should be 78 - 131 (1 read) according to: + +=for comment +samtools view -f 128 -F 1028 t/data/test2.bam 1 | grep 54M | grep NM:i:0 \ +| awk '{ print $4 "\t" $4 + 53 }' | head -4 +=cut + +# Last peak should be 8666 - 8719 (1 read) according to: + +=for comment +samtools view -f 128 -F 1028 t/data/test2.bam 1 | grep 54M | grep NM:i:0 \ +| awk '{ print $4 "\t" $4 + 53 }' | tail -4 +=cut + +$peaks = get_read_peaks( + { + bam_file => 't/data/test2.bam', + mismatch_threshold => 0, + peak_buffer_width => 100, + seq_name => '1', + tags => [ 'NNNNBCAGAG', 'NNNNBGCACG' ], + } +); +is( scalar keys %{$peaks}, 1, '1 sequence' ); +is( $peaks->{'1'}->[0]->[0], 78, 'Start of first peak' ); +is( $peaks->{'1'}->[0]->[1], 131, 'End of first peak' ); +is( 
$peaks->{'1'}->[0]->[2], 1, 'First peak read count' ); +is( $peaks->{'1'}->[-1]->[0], 8666, 'Start of last peak' ); +is( $peaks->{'1'}->[-1]->[1], 8719, 'End of last peak' ); +is( $peaks->{'1'}->[-1]->[2], 1, 'Last peak read count' ); + +# Check read peaks returned with non-existent tag +$peaks = get_read_peaks( + { + bam_file => 't/data/test1.bam', + mismatch_threshold => 0, + peak_buffer_width => 100, + seq_name => '1', + tags => ['NNNNTTTTTT'], + } +); +is( scalar keys %{$peaks}, 1, '1 sequence' ); +is( scalar @{ $peaks->{'1'} }, 0, '0 peaks' ); + +# Check getting 3' ends required parameters +throws_ok { + get_three_prime_ends( + { + mismatch_threshold => 0, + seq_name => '1', + tags => [ 'NNNNBGAGGC', 'NNNNBAGAAG' ], + regions => [ [ 1, 1000, 10, -10 ] ], + } + ); +} +qr/No BAM file specified/ms, 'No BAM file'; +throws_ok { + get_three_prime_ends( + { + bam_file => 't/data/test1.bam', + seq_name => '1', + tags => [ 'NNNNBGAGGC', 'NNNNBAGAAG' ], + regions => [ [ 1, 1000, 10, -10 ] ], + } + ); +} +qr/No mismatch threshold specified/ms, 'No mismatch threshold'; +throws_ok { + get_three_prime_ends( + { + bam_file => 't/data/test1.bam', + mismatch_threshold => 0, + tags => [ 'NNNNBGAGGC', 'NNNNBAGAAG' ], + regions => [ [ 1, 1000, 10, -10 ] ], + } + ); +} +qr/No sequence name specified/ms, 'No sequence name'; +throws_ok { + get_three_prime_ends( + { + bam_file => 't/data/test1.bam', + mismatch_threshold => 0, + seq_name => '1', + regions => [ [ 1, 1000, 10, -10 ] ], + } + ); +} +qr/No tags specified/ms, 'No tags'; +throws_ok { + get_three_prime_ends( + { + bam_file => 't/data/test1.bam', + mismatch_threshold => 0, + seq_name => '1', + tags => [ 'NNNNBGAGGC', 'NNNNBAGAAG' ], + } + ); +} +qr/No regions specified/ms, 'No regions'; + +my $three_prime_ends; + +# Check 3' ends returned in first 2000 bp of chromosome 1 of test BAM file +# Should be 9 3' ends according to: + +=for comment +(samtools view -f 160 -F 1036 t/data/test1.bam 1:1-2000 \ +| grep NM:i:0 | grep 54M | awk '{ print "1:" $8 + 29 ":1" }'; \ +samtools view -f 128 -F 1068 t/data/test1.bam 1:1-2000 \ +| grep NM:i:0 | grep 54M | awk '{ print "1:" $8 ":-1" }') \ +| wc -l +=cut + +# One forward strand 3' end should be 1:2642:1 with 1 read according to: + +=for comment +samtools view -f 160 -F 1036 t/data/test1.bam 1:1-2000 \ +| grep NM:i:0 | grep 54M | awk '{ print "1:" $8 + 29 ":1" }' | sort | uniq -c +=cut + +# One reverse strand 3' end should be 1:632:-1 with 1 read according to: + +=for comment +samtools view -f 128 -F 1068 t/data/test1.bam 1:1-2000 \ +| grep NM:i:0 | grep 54M | awk '{ print "1:" $8 ":-1" }' | sort | uniq -c +=cut + +$three_prime_ends = get_three_prime_ends( + { + bam_file => 't/data/test1.bam', + mismatch_threshold => 0, + seq_name => '1', + tags => [ 'NNNNBGAGGC', 'NNNNBAGAAG' ], + regions => [ [ 1, 2000, 10, -10 ] ], + } +); +is( scalar keys %{$three_prime_ends}, 1, '1 sequence' ); +is( scalar @{ $three_prime_ends->{'1'} }, 1, '1 region' ); +is( scalar @{ $three_prime_ends->{'1'}->[0]->[4] }, 9, q{9 3' ends} ); +my $got_forward = 0; +my $got_reverse = 0; +foreach my $three_prime_end ( @{ $three_prime_ends->{'1'}->[0]->[4] } ) { + my ( $seq, $pos, $strand, $read_count ) = @{$three_prime_end}; + my $string_form = join q{:}, $seq, $pos, $strand; + if ( $string_form eq '1:2642:1' ) { + $got_forward = 1; + } + if ( $string_form eq '1:632:-1' ) { + $got_reverse = 1; + } +} +ok( $got_forward, q{1 forward strand 3' end} ); +ok( $got_reverse, q{1 reverse strand 3' end} ); + +# Get 3' ends returned with non-existent tag 
+$three_prime_ends = get_three_prime_ends( + { + bam_file => 't/data/test1.bam', + mismatch_threshold => 0, + seq_name => '1', + tags => ['NNNNTTTTTT'], + regions => [ [ 1, 2000, 10, -10 ] ], + } +); +is( scalar keys %{$three_prime_ends}, 1, '1 sequence' ); +is( scalar @{ $three_prime_ends->{'1'} }, 1, '1 region' ); +is( scalar @{ $three_prime_ends->{'1'}->[0]->[4] }, 0, q{0 3' ends} ); + +# Get 3' ends for sequence name with a peak +$three_prime_ends = get_three_prime_ends( + { + bam_file => 't/data/test1.bam', + mismatch_threshold => 0, + seq_name => '3', + tags => [ 'NNNNBGAGGC', 'NNNNBAGAAG' ], + regions => [ [ 1, 10000, 10, -10 ] ], + } +); +is( scalar keys %{$three_prime_ends}, 1, '1 sequence' ); +is( scalar @{ $three_prime_ends->{'3'} }, 1, '1 region' ); +my $max_read_count = 0; +foreach my $three_prime_end ( @{ $three_prime_ends->{'3'}->[0]->[4] } ) { + my ( $seq, $pos, $strand, $read_count ) = @{$three_prime_end}; + if ( $read_count > $max_read_count ) { + $max_read_count = $read_count; + } +} +ok( $max_read_count > 1, q{Read count for 3' end of peak} ); + +# Check merging 3' ends required parameters +throws_ok { + merge_three_prime_ends( { regions => [ [ [ 1, 1000, 10, -10, [] ] ] ], } ); +} +qr/No sequence name specified/ms, 'No sequence name'; +throws_ok { + merge_three_prime_ends( { seq_name => '1', } ); +} +qr/No regions specified/ms, 'No regions'; + +# Test lists with different number of regions +throws_ok { + merge_three_prime_ends( + { + seq_name => '1', + regions => [ + [ [ 1, 1000, 10, -10, [] ], ], + [ [ 1, 1000, 10, -10, [] ], [ 2000, 3000, 10, -10, [] ], ], + ], + } + ); +} +qr/Number of regions does not match in all lists/ms, + 'Different number of regions'; + +# Test lists with different regions +throws_ok { + merge_three_prime_ends( + { + seq_name => '1', + regions => [ + [ [ 1, 1000, 10, -10, [] ], [ 2000, 4000, 10, -10, [] ], ], + [ [ 1, 1000, 10, -10, [] ], [ 3000, 4000, 10, -10, [] ], ], + ], + } + ); +} +qr/Regions not in the same order or not the same in each list/ms, + 'Different region start'; +throws_ok { + merge_three_prime_ends( + { + seq_name => '1', + regions => [ + [ [ 1, 1000, 10, -10, [] ], [ 2000, 4000, 10, -10, [] ], ], + [ [ 1, 1000, 10, -10, [] ], [ 2000, 5000, 10, -10, [] ], ], + ], + } + ); +} +qr/Regions not in the same order or not the same in each list/ms, + 'Different region end'; +throws_ok { + merge_three_prime_ends( + { + seq_name => '1', + regions => [ + [ [ 1, 1000, 10, -10, [] ], [ 2000, 4000, 10, -10, [] ], ], + [ [ 1, 1000, 10, -10, [] ], [ 2000, 4000, 20, -10, [] ], ], + ], + } + ); +} +qr/Regions not in the same order or not the same in each list/ms, + 'Different region maximum read count'; +throws_ok { + merge_three_prime_ends( + { + seq_name => '1', + regions => [ + [ [ 1, 1000, 10, -10, [] ], [ 2000, 4000, 10, -10, [] ], ], + [ [ 1, 1000, 10, -10, [] ], [ 2000, 4000, 10, -20, [] ], ], + ], + } + ); +} +qr/Regions not in the same order or not the same in each list/ms, + 'Different region log probability sum'; + +# Test one list of regions +$three_prime_ends = merge_three_prime_ends( + { + seq_name => '1', + regions => [ [ [ 1, 1000, 10, -10, [] ] ] ], + } +); +is( scalar keys %{$three_prime_ends}, 1, '1 sequence' ); +is( scalar @{ $three_prime_ends->{'1'} }, 1, '1 region' ); +is( $three_prime_ends->{'1'}->[0]->[0], 1, 'Region start' ); +is( $three_prime_ends->{'1'}->[0]->[1], 1000, 'Region end' ); +is( $three_prime_ends->{'1'}->[0]->[2], 10, 'Region maximum read count' ); +is( $three_prime_ends->{'1'}->[0]->[3], -10, 'Region 
log probability sum' ); +is( @{ $three_prime_ends->{'1'}->[0]->[4] }, 0, q{No 3' ends} ); + +# Test two lists of regions +$three_prime_ends = merge_three_prime_ends( + { + seq_name => '1', + regions => [ + [ + [ + 1, 1000, 10, -10, + [ [ '1', 2000, 1, 10 ], [ '1', 3000, 1, 10 ], ] + ] + ], + [ [ 1, 1000, 10, -10, [ [ '1', 3000, 1, 10 ], ] ] ], + ], + } +); +is( scalar keys %{$three_prime_ends}, 1, '1 sequence' ); +is( scalar @{ $three_prime_ends->{'1'} }, 1, '1 region' ); +is( $three_prime_ends->{'1'}->[0]->[0], 1, 'Region start' ); +is( $three_prime_ends->{'1'}->[0]->[1], 1000, 'Region end' ); +is( $three_prime_ends->{'1'}->[0]->[2], 10, 'Region maximum read count' ); +is( $three_prime_ends->{'1'}->[0]->[3], -10, 'Region log probability sum' ); +is( @{ $three_prime_ends->{'1'}->[0]->[4] }, 2, q{2 3' ends} ); +is( $three_prime_ends->{'1'}->[0]->[4]->[0]->[0], '1', q{3' end sequence} ); +is( $three_prime_ends->{'1'}->[0]->[4]->[0]->[1], 3000, q{3' end position} ); +is( $three_prime_ends->{'1'}->[0]->[4]->[0]->[2], 1, q{3' end strand} ); +is( $three_prime_ends->{'1'}->[0]->[4]->[0]->[3], 20, q{3' end read count} ); + +# Test different strands +$three_prime_ends = merge_three_prime_ends( + { + seq_name => '1', + regions => [ + [ [ 1, 1000, 10, -10, [ [ '1', 2000, 1, 10 ], ] ] ], + [ [ 1, 1000, 10, -10, [ [ '1', 2000, -1, 10 ], ] ] ], + ], + } +); +is( scalar keys %{$three_prime_ends}, 1, '1 sequence' ); +is( scalar @{ $three_prime_ends->{'1'} }, 1, '1 region' ); +is( @{ $three_prime_ends->{'1'}->[0]->[4] }, 2, q{2 3' ends} ); + +my $analysis; + +# Mock analysis object returning non-polyA +$analysis = Test::MockObject->new(); +$analysis->set_isa('DETCT::Analysis'); +$analysis->set_always( 'get_subsequence', 'TTTTTTTTTT' ); + +# Check filtering 3' ends required parameters +throws_ok { + filter_three_prime_ends( + { + seq_name => '1', + regions => [ [ [ 1, 1000, 10, -10, [] ] ] ], + } + ); +} +qr/No analysis specified/ms, 'No analysis'; +throws_ok { + filter_three_prime_ends( + { + analysis => $analysis, + regions => [ [ [ 1, 1000, 10, -10, [] ] ] ], + } + ); +} +qr/No sequence name specified/ms, 'No sequence name'; +throws_ok { + filter_three_prime_ends( + { + analysis => $analysis, + seq_name => '1', + } + ); +} +qr/No regions specified/ms, 'No regions'; + +# Test filtering 3' ends +$three_prime_ends = filter_three_prime_ends( + { + analysis => $analysis, + seq_name => '1', + regions => [ + [ + 1, 1000, 10, -10, + [ + [ '1', 1000, 1, 20 ], + [ '1', 2000, -1, 10 ], + [ '1', 3000, 1, 1 ], + [ '1', 4000, 1, 3 ], + ] + ] + ], + } +); +is( scalar keys %{$three_prime_ends}, 1, '1 sequence' ); +is( scalar @{ $three_prime_ends->{'1'} }, 1, '1 region' ); +is( $three_prime_ends->{'1'}->[0]->[0], 1, 'Region start' ); +is( $three_prime_ends->{'1'}->[0]->[1], 1000, 'Region end' ); +is( $three_prime_ends->{'1'}->[0]->[2], 10, 'Region maximum read count' ); +is( $three_prime_ends->{'1'}->[0]->[3], -10, 'Region log probability sum' ); +is( @{ $three_prime_ends->{'1'}->[0]->[4] }, 2, q{2 3' ends} ); +is( $three_prime_ends->{'1'}->[0]->[4]->[0]->[0], '1', q{3' end sequence} ); +is( $three_prime_ends->{'1'}->[0]->[4]->[0]->[1], 1000, q{3' end position} ); +is( $three_prime_ends->{'1'}->[0]->[4]->[0]->[2], 1, q{3' end strand} ); +is( $three_prime_ends->{'1'}->[0]->[4]->[0]->[3], 20, q{3' end read count} ); + +# Mock analysis object returning polyA +$analysis = Test::MockObject->new(); +$analysis->set_isa('DETCT::Analysis'); +$analysis->set_always( 'get_subsequence', 'AAAATTTTTT' ); + +# Test filtering 3' ends 
+$three_prime_ends = filter_three_prime_ends( + { + analysis => $analysis, + seq_name => '1', + regions => [ + [ + 1, 1000, 10, -10, + [ + [ '1', 1000, 1, 20 ], + [ '1', 2000, -1, 10 ], + [ '1', 3000, 1, 1 ], + [ '1', 4000, 1, 3 ], + ] + ] + ], + } +); +is( scalar keys %{$three_prime_ends}, 1, '1 sequence' ); +is( scalar @{ $three_prime_ends->{'1'} }, 1, '1 region' ); +is( $three_prime_ends->{'1'}->[0]->[0], 1, 'Region start' ); +is( $three_prime_ends->{'1'}->[0]->[1], 1000, 'Region end' ); +is( $three_prime_ends->{'1'}->[0]->[2], 10, 'Region maximum read count' ); +is( $three_prime_ends->{'1'}->[0]->[3], -10, 'Region log probability sum' ); +is( @{ $three_prime_ends->{'1'}->[0]->[4] }, 0, q{0 3' ends} ); + +# Check choosing 3' end required parameters +throws_ok { + choose_three_prime_end( { regions => [ [ [ 1, 1000, 10, -10, [] ] ] ], } ); +} +qr/No sequence name specified/ms, 'No sequence name'; +throws_ok { + choose_three_prime_end( { seq_name => '1', } ); +} +qr/No regions specified/ms, 'No regions'; + +# Test choosing 3' end +$three_prime_ends = choose_three_prime_end( + { + seq_name => '1', + regions => [ + [ + 1, 1000, 10, -10, + [ + [ '1', 1000, 1, 20 ], + [ '1', 2000, -1, 10 ], + [ '1', 3000, 1, 1 ], + [ '1', 4000, 1, 3 ], + ] + ] + ], + } +); +is( scalar keys %{$three_prime_ends}, 1, '1 sequence' ); +is( scalar @{ $three_prime_ends->{'1'} }, 1, '1 region' ); +is( $three_prime_ends->{'1'}->[0]->[0], 1, 'Region start' ); +is( $three_prime_ends->{'1'}->[0]->[1], 1000, 'Region end' ); +is( $three_prime_ends->{'1'}->[0]->[2], 10, 'Region maximum read count' ); +is( $three_prime_ends->{'1'}->[0]->[3], -10, 'Region log probability sum' ); +is( $three_prime_ends->{'1'}->[0]->[4], '1', q{3' end sequence} ); +is( $three_prime_ends->{'1'}->[0]->[5], 1000, q{3' end position} ); +is( $three_prime_ends->{'1'}->[0]->[6], 1, q{3' end strand} ); +is( $three_prime_ends->{'1'}->[0]->[7], 20, q{3' end read count} ); + +# Test choosing 3' end with no 3' ends +$three_prime_ends = choose_three_prime_end( + { + seq_name => '1', + regions => [ [ 1, 1000, 10, -10, [] ] ], + } +); +is( scalar keys %{$three_prime_ends}, 1, '1 sequence' ); +is( scalar @{ $three_prime_ends->{'1'} }, 1, '1 region' ); +is( $three_prime_ends->{'1'}->[0]->[0], 1, 'Region start' ); +is( $three_prime_ends->{'1'}->[0]->[1], 1000, 'Region end' ); +is( $three_prime_ends->{'1'}->[0]->[2], 10, 'Region maximum read count' ); +is( $three_prime_ends->{'1'}->[0]->[3], -10, 'Region log probability sum' ); +is( $three_prime_ends->{'1'}->[0]->[4], undef, q{3' end sequence} ); +is( $three_prime_ends->{'1'}->[0]->[5], undef, q{3' end position} ); +is( $three_prime_ends->{'1'}->[0]->[6], undef, q{3' end strand} ); +is( $three_prime_ends->{'1'}->[0]->[7], undef, q{3' end read count} ); + +# Test choosing 3' end with reduced region end +$three_prime_ends = choose_three_prime_end( + { + seq_name => '1', + regions => [ [ 1, 1000, 10, -10, [ [ '1', 900, 1, 20 ], ] ] ], + } +); +is( scalar keys %{$three_prime_ends}, 1, '1 sequence' ); +is( scalar @{ $three_prime_ends->{'1'} }, 1, '1 region' ); +is( $three_prime_ends->{'1'}->[0]->[0], 1, 'Region start' ); +is( $three_prime_ends->{'1'}->[0]->[1], 900, 'Region end' ); +is( $three_prime_ends->{'1'}->[0]->[2], 10, 'Region maximum read count' ); +is( $three_prime_ends->{'1'}->[0]->[3], -10, 'Region log probability sum' ); +is( $three_prime_ends->{'1'}->[0]->[4], '1', q{3' end sequence} ); +is( $three_prime_ends->{'1'}->[0]->[5], 900, q{3' end position} ); +is( $three_prime_ends->{'1'}->[0]->[6], 1, q{3' end 
strand} ); +is( $three_prime_ends->{'1'}->[0]->[7], 20, q{3' end read count} ); + +# Test choosing 3' end with reduced region start +$three_prime_ends = choose_three_prime_end( + { + seq_name => '1', + regions => [ [ 1, 1000, 10, -10, [ [ '1', 100, -1, 20 ], ] ] ], + } +); +is( scalar keys %{$three_prime_ends}, 1, '1 sequence' ); +is( scalar @{ $three_prime_ends->{'1'} }, 1, '1 region' ); +is( $three_prime_ends->{'1'}->[0]->[0], 100, 'Region start' ); +is( $three_prime_ends->{'1'}->[0]->[1], 1000, 'Region end' ); +is( $three_prime_ends->{'1'}->[0]->[2], 10, 'Region maximum read count' ); +is( $three_prime_ends->{'1'}->[0]->[3], -10, 'Region log probability sum' ); +is( $three_prime_ends->{'1'}->[0]->[4], '1', q{3' end sequence} ); +is( $three_prime_ends->{'1'}->[0]->[5], 100, q{3' end position} ); +is( $three_prime_ends->{'1'}->[0]->[6], -1, q{3' end strand} ); +is( $three_prime_ends->{'1'}->[0]->[7], 20, q{3' end read count} ); + +# Test choosing 3' end with different sequence name +$three_prime_ends = choose_three_prime_end( + { + seq_name => '1', + regions => [ [ 1, 1000, 10, -10, [ [ '2', 100, -1, 20 ], ] ] ], + } +); +is( scalar keys %{$three_prime_ends}, 1, '1 sequence' ); +is( scalar @{ $three_prime_ends->{'1'} }, 1, '1 region' ); +is( $three_prime_ends->{'1'}->[0]->[0], 1, 'Region start' ); +is( $three_prime_ends->{'1'}->[0]->[1], 1000, 'Region end' ); +is( $three_prime_ends->{'1'}->[0]->[2], 10, 'Region maximum read count' ); +is( $three_prime_ends->{'1'}->[0]->[3], -10, 'Region log probability sum' ); +is( $three_prime_ends->{'1'}->[0]->[4], '2', q{3' end sequence} ); +is( $three_prime_ends->{'1'}->[0]->[5], 100, q{3' end position} ); +is( $three_prime_ends->{'1'}->[0]->[6], -1, q{3' end strand} ); +is( $three_prime_ends->{'1'}->[0]->[7], 20, q{3' end read count} ); + +# Test choosing 3' end beyond region start +$three_prime_ends = choose_three_prime_end( + { + seq_name => '1', + regions => [ [ 1000, 2000, 10, -10, [ [ '1', 900, 1, 20 ], ] ] ], + } +); +is( scalar keys %{$three_prime_ends}, 1, '1 sequence' ); +is( scalar @{ $three_prime_ends->{'1'} }, 1, '1 region' ); +is( $three_prime_ends->{'1'}->[0]->[0], 1000, 'Region start' ); +is( $three_prime_ends->{'1'}->[0]->[1], 2000, 'Region end' ); +is( $three_prime_ends->{'1'}->[0]->[2], 10, 'Region maximum read count' ); +is( $three_prime_ends->{'1'}->[0]->[3], -10, 'Region log probability sum' ); +is( $three_prime_ends->{'1'}->[0]->[4], '1', q{3' end sequence} ); +is( $three_prime_ends->{'1'}->[0]->[5], 900, q{3' end position} ); +is( $three_prime_ends->{'1'}->[0]->[6], 1, q{3' end strand} ); +is( $three_prime_ends->{'1'}->[0]->[7], 20, q{3' end read count} ); +$three_prime_ends = choose_three_prime_end( + { + seq_name => '1', + regions => [ [ 1000, 2000, 10, -10, [ [ '1', 900, -1, 20 ], ] ] ], + } +); +is( scalar keys %{$three_prime_ends}, 1, '1 sequence' ); +is( scalar @{ $three_prime_ends->{'1'} }, 1, '1 region' ); +is( $three_prime_ends->{'1'}->[0]->[0], 1000, 'Region start' ); +is( $three_prime_ends->{'1'}->[0]->[1], 2000, 'Region end' ); +is( $three_prime_ends->{'1'}->[0]->[2], 10, 'Region maximum read count' ); +is( $three_prime_ends->{'1'}->[0]->[3], -10, 'Region log probability sum' ); +is( $three_prime_ends->{'1'}->[0]->[4], '1', q{3' end sequence} ); +is( $three_prime_ends->{'1'}->[0]->[5], 900, q{3' end position} ); +is( $three_prime_ends->{'1'}->[0]->[6], -1, q{3' end strand} ); +is( $three_prime_ends->{'1'}->[0]->[7], 20, q{3' end read count} ); + +# Test choosing 3' end beyond region end +$three_prime_ends = 
choose_three_prime_end( + { + seq_name => '1', + regions => [ [ 1000, 2000, 10, -10, [ [ '1', 2100, -1, 20 ], ] ] ], + } +); +is( scalar keys %{$three_prime_ends}, 1, '1 sequence' ); +is( scalar @{ $three_prime_ends->{'1'} }, 1, '1 region' ); +is( $three_prime_ends->{'1'}->[0]->[0], 1000, 'Region start' ); +is( $three_prime_ends->{'1'}->[0]->[1], 2000, 'Region end' ); +is( $three_prime_ends->{'1'}->[0]->[2], 10, 'Region maximum read count' ); +is( $three_prime_ends->{'1'}->[0]->[3], -10, 'Region log probability sum' ); +is( $three_prime_ends->{'1'}->[0]->[4], '1', q{3' end sequence} ); +is( $three_prime_ends->{'1'}->[0]->[5], 2100, q{3' end position} ); +is( $three_prime_ends->{'1'}->[0]->[6], -1, q{3' end strand} ); +is( $three_prime_ends->{'1'}->[0]->[7], 20, q{3' end read count} ); +$three_prime_ends = choose_three_prime_end( + { + seq_name => '1', + regions => [ [ 1000, 2000, 10, -10, [ [ '1', 2100, 1, 20 ], ] ] ], + } +); +is( scalar keys %{$three_prime_ends}, 1, '1 sequence' ); +is( scalar @{ $three_prime_ends->{'1'} }, 1, '1 region' ); +is( $three_prime_ends->{'1'}->[0]->[0], 1000, 'Region start' ); +is( $three_prime_ends->{'1'}->[0]->[1], 2000, 'Region end' ); +is( $three_prime_ends->{'1'}->[0]->[2], 10, 'Region maximum read count' ); +is( $three_prime_ends->{'1'}->[0]->[3], -10, 'Region log probability sum' ); +is( $three_prime_ends->{'1'}->[0]->[4], '1', q{3' end sequence} ); +is( $three_prime_ends->{'1'}->[0]->[5], 2100, q{3' end position} ); +is( $three_prime_ends->{'1'}->[0]->[6], 1, q{3' end strand} ); +is( $three_prime_ends->{'1'}->[0]->[7], 20, q{3' end read count} ); + +# Test choosing 3' end with same read count +$three_prime_ends = choose_three_prime_end( + { + seq_name => '1', + regions => [ + [ + 1000, 2000, 10, -10, + [ [ '1', 900, -1, 20 ], [ '1', 2200, -1, 20 ], ] + ] + ], + } +); +is( scalar keys %{$three_prime_ends}, 1, '1 sequence' ); +is( scalar @{ $three_prime_ends->{'1'} }, 1, '1 region' ); +is( $three_prime_ends->{'1'}->[0]->[0], 1000, 'Region start' ); +is( $three_prime_ends->{'1'}->[0]->[1], 2000, 'Region end' ); +is( $three_prime_ends->{'1'}->[0]->[2], 10, 'Region maximum read count' ); +is( $three_prime_ends->{'1'}->[0]->[3], -10, 'Region log probability sum' ); +is( $three_prime_ends->{'1'}->[0]->[4], '1', q{3' end sequence} ); +is( $three_prime_ends->{'1'}->[0]->[5], 900, q{3' end position} ); +is( $three_prime_ends->{'1'}->[0]->[6], -1, q{3' end strand} ); +is( $three_prime_ends->{'1'}->[0]->[7], 20, q{3' end read count} ); +$three_prime_ends = choose_three_prime_end( + { + seq_name => '1', + regions => [ + [ + 1000, 2000, 10, -10, + [ [ '1', 900, -1, 20 ], [ '1', 2100, -1, 20 ], ] + ] + ], + } +); +is( scalar keys %{$three_prime_ends}, 1, '1 sequence' ); +is( scalar @{ $three_prime_ends->{'1'} }, 1, '1 region' ); +is( $three_prime_ends->{'1'}->[0]->[0], 1000, 'Region start' ); +is( $three_prime_ends->{'1'}->[0]->[1], 2000, 'Region end' ); +is( $three_prime_ends->{'1'}->[0]->[2], 10, 'Region maximum read count' ); +is( $three_prime_ends->{'1'}->[0]->[3], -10, 'Region log probability sum' ); +is( $three_prime_ends->{'1'}->[0]->[4], '1', q{3' end sequence} ); +is( $three_prime_ends->{'1'}->[0]->[6], -1, q{3' end strand} ); +is( $three_prime_ends->{'1'}->[0]->[7], 20, q{3' end read count} ); +$three_prime_ends = choose_three_prime_end( + { + seq_name => '1', + regions => [ + [ + 1000, 2000, 10, -10, + [ [ '2', 900, -1, 20 ], [ '2', 2100, -1, 20 ], ] + ] + ], + } +); +is( scalar keys %{$three_prime_ends}, 1, '1 sequence' ); +is( scalar @{ 
$three_prime_ends->{'1'} }, 1, '1 region' ); +is( $three_prime_ends->{'1'}->[0]->[0], 1000, 'Region start' ); +is( $three_prime_ends->{'1'}->[0]->[1], 2000, 'Region end' ); +is( $three_prime_ends->{'1'}->[0]->[2], 10, 'Region maximum read count' ); +is( $three_prime_ends->{'1'}->[0]->[3], -10, 'Region log probability sum' ); +is( $three_prime_ends->{'1'}->[0]->[4], '2', q{3' end sequence} ); +is( $three_prime_ends->{'1'}->[0]->[6], -1, q{3' end strand} ); +is( $three_prime_ends->{'1'}->[0]->[7], 20, q{3' end read count} ); + +# Test checking for polyA +is( DETCT::Misc::BAM::is_polya('TTTTTTTTTT'), 0, 'PolyT' ); +is( DETCT::Misc::BAM::is_polya('AAAATTTTTT'), 1, '>3 As at start' ); +is( DETCT::Misc::BAM::is_polya('TTTTTTAAAA'), 0, '>3 As at end' ); +is( DETCT::Misc::BAM::is_polya('TAAAATAAAT'), 1, '>6 As' ); +is( DETCT::Misc::BAM::is_polya('AAATAAATTT'), 1, 'AAA.AAA... regexp' ); +is( DETCT::Misc::BAM::is_polya('AAATAATATT'), 1, 'AAA.AA.A.. regexp' ); +is( DETCT::Misc::BAM::is_polya('AAATATAATT'), 1, 'AAA.A.AA.. regexp' ); +is( DETCT::Misc::BAM::is_polya('AATAAAATTT'), 1, 'AA.AAAA... regexp' ); +is( DETCT::Misc::BAM::is_polya('AATAAATATT'), 1, 'AA.AAA.A.. regexp' ); +is( DETCT::Misc::BAM::is_polya('AATATAAATT'), 1, 'AA.A.AAA.. regexp' ); +is( DETCT::Misc::BAM::is_polya('ATAAAAATTT'), 1, 'A.AAAAA... regexp' ); +is( DETCT::Misc::BAM::is_polya('ATAAAATATT'), 1, 'A.AAAA.A.. regexp' ); +is( DETCT::Misc::BAM::is_polya('ATAAATAATT'), 1, 'A.AAA.AA.. regexp' ); +is( DETCT::Misc::BAM::is_polya('ATAATAAATT'), 1, 'A.AA.AAA.. regexp' ); +is( DETCT::Misc::BAM::is_polya('ATATAAAATT'), 1, 'A.A.AAAA.. regexp' ); +is( DETCT::Misc::BAM::is_polya('AATAATAATT'), 1, 'AA.AA.AA.. regexp' ); +is( DETCT::Misc::BAM::is_polya('TATAATAATA'), 0, '6 As' ); + +# Check counting reads required parameters +throws_ok { + count_reads( + { + mismatch_threshold => 2, + seq_name => '1', + regions => [ [ 1000, 2000, 10, -10, '1', 2000, 1, 10 ], ], + tags => ['NNNNBGAGGC'], + } + ); +} +qr/No BAM file specified/ms, 'No BAM file'; +throws_ok { + count_reads( + { + bam_file => 't/data/test1.bam', + seq_name => '1', + regions => [ [ 1000, 2000, 10, -10, '1', 2000, 1, 10 ], ], + tags => ['NNNNBGAGGC'], + } + ); +} +qr/No mismatch threshold specified/ms, 'No mismatch threshold'; +throws_ok { + count_reads( + { + bam_file => 't/data/test1.bam', + mismatch_threshold => 2, + regions => [ [ 1000, 2000, 10, -10, '1', 2000, 1, 10 ], ], + tags => ['NNNNBGAGGC'], + } + ); +} +qr/No sequence name specified/ms, 'No sequence name'; +throws_ok { + count_reads( + { + bam_file => 't/data/test1.bam', + mismatch_threshold => 2, + seq_name => '1', + tags => ['NNNNBGAGGC'], + } + ); +} +qr/No regions specified/ms, 'No regions'; +throws_ok { + count_reads( + { + bam_file => 't/data/test1.bam', + mismatch_threshold => 0, + seq_name => '1', + regions => [ [ 1, 2000, 10, -10, '1', 2000, 1, 10 ], ], + } + ); +} +qr/No tags specified/ms, 'No tags'; + +# Check read counts returned by test BAM file +# Should be 11 reads according to: + +=for comment +samtools view -f 128 -F 1028 t/data/test1.bam 1:1-2000 \ +| grep 54M | grep NM:i:0 | awk '{ print $1 }' \ +| sed -e 's/.*#//' | grep GAGGC$ | wc -l +=cut + +$three_prime_ends = count_reads( + { + bam_file => 't/data/test1.bam', + mismatch_threshold => 0, + seq_name => '1', + regions => [ [ 1, 2000, 10, -10, '1', 2000, 1, 10 ], ], + tags => ['NNNNBGAGGC'], + } +); +is( scalar keys %{$three_prime_ends}, 1, '1 sequence' ); +is( scalar @{ $three_prime_ends->{'1'} }, 1, '1 region' ); +is( 
$three_prime_ends->{'1'}->[0]->[0], 1, 'Region start' ); +is( $three_prime_ends->{'1'}->[0]->[1], 2000, 'Region end' ); +is( $three_prime_ends->{'1'}->[0]->[2], 10, 'Region maximum read count' ); +is( $three_prime_ends->{'1'}->[0]->[3], -10, 'Region log probability sum' ); +is( $three_prime_ends->{'1'}->[0]->[4], '1', q{3' end sequence} ); +is( $three_prime_ends->{'1'}->[0]->[5], 2000, q{3' end position} ); +is( $three_prime_ends->{'1'}->[0]->[6], 1, q{3' end strand} ); +is( $three_prime_ends->{'1'}->[0]->[7], 10, q{3' end read count} ); +is( scalar keys %{ $three_prime_ends->{'1'}->[0]->[8] }, 1, '1 tag' ); +is( $three_prime_ends->{'1'}->[0]->[8]->{NNNNBGAGGC}, 4, '4 reads' ); + +# Mock sample objects +my $sample1 = Test::MockObject->new(); +$sample1->set_isa('DETCT::Sample'); +$sample1->set_always( 'bam_file', '1.bam' ); +$sample1->set_always( 'tag', 'AA' ); +my $sample2 = Test::MockObject->new(); +$sample2->set_isa('DETCT::Sample'); +$sample2->set_always( 'bam_file', '2.bam' ); +$sample2->set_always( 'tag', 'TT' ); +my $samples = [ $sample1, $sample2 ]; + +# Check merging read counts required parameters +throws_ok { + merge_read_counts( + { + regions => { + '1.bam' => + [ [ 1, 1000, 10, -10, '1', 2000, 1, 10, { AA => 10 } ] ], + }, + samples => $samples, + } + ); +} +qr/No sequence name specified/ms, 'No sequence name'; +throws_ok { + merge_read_counts( + { + seq_name => '1', + samples => $samples, + } + ); +} +qr/No regions specified/ms, 'No regions'; +throws_ok { + merge_read_counts( + { + seq_name => '1', + regions => { + '1.bam' => + [ [ 1, 1000, 10, -10, '1', 2000, 1, 10, { AA => 10 } ] ], + }, + } + ); +} +qr/No samples specified/ms, 'No samples'; + +# Test lists with different number of regions +throws_ok { + merge_read_counts( + { + seq_name => '1', + regions => { + '1.bam' => [ + [ 1, 1000, 10, -10, '1', 2000, 1, 10, { AA => 10 } ], + [ 3000, 4000, 10, -10, '1', 5000, 1, 10, { AA => 10 } ], + ], + '2.bam' => + [ [ 1, 1000, 10, -10, '1', 2000, 1, 10, { TT => 10 } ] ], + }, + samples => $samples, + } + ); +} +qr/Number of regions does not match in all lists/ms, + 'Different number of regions'; + +# Test lists with different regions +throws_ok { + merge_read_counts( + { + seq_name => '1', + regions => { + '1.bam' => + [ [ 1, 1000, 10, -10, '1', 2000, 1, 10, { AA => 10 } ] ], + '2.bam' => + [ [ 2, 1000, 10, -10, '1', 2000, 1, 10, { TT => 10 } ] ], + }, + samples => $samples, + } + ); +} +qr/Regions not in the same order or not the same in each list/ms, + 'Different region start'; +throws_ok { + merge_read_counts( + { + seq_name => '1', + regions => { + '1.bam' => + [ [ 1, 1000, 10, -10, '1', 2000, 1, 10, { AA => 10 } ] ], + '2.bam' => + [ [ 1, 1001, 10, -10, '1', 2000, 1, 10, { TT => 10 } ] ], + }, + samples => $samples, + } + ); +} +qr/Regions not in the same order or not the same in each list/ms, + 'Different region end'; +throws_ok { + merge_read_counts( + { + seq_name => '1', + regions => { + '1.bam' => + [ [ 1, 1000, 10, -10, '1', 2000, 1, 10, { AA => 10 } ] ], + '2.bam' => + [ [ 1, 1000, 11, -10, '1', 2000, 1, 10, { TT => 10 } ] ], + }, + samples => $samples, + } + ); +} +qr/Regions not in the same order or not the same in each list/ms, + 'Different region maximum read count'; +throws_ok { + merge_read_counts( + { + seq_name => '1', + regions => { + '1.bam' => + [ [ 1, 1000, 10, -10, '1', 2000, 1, 10, { AA => 10 } ] ], + '2.bam' => + [ [ 1, 1000, 10, -11, '1', 2000, 1, 10, { TT => 10 } ] ], + }, + samples => $samples, + } + ); +} +qr/Regions not in the same order or not the 
same in each list/ms, + 'Different region log probability sum'; +throws_ok { + merge_read_counts( + { + seq_name => '1', + regions => { + '1.bam' => + [ [ 1, 1000, 10, -10, '1', 2000, 1, 10, { AA => 10 } ] ], + '2.bam' => + [ [ 1, 1000, 10, -10, '2', 2000, 1, 10, { TT => 10 } ] ], + }, + samples => $samples, + } + ); +} +qr/Regions not in the same order or not the same in each list/ms, + q{Different 3' end sequence}; +throws_ok { + merge_read_counts( + { + seq_name => '1', + regions => { + '1.bam' => + [ [ 1, 1000, 10, -10, '1', 2000, 1, 10, { AA => 10 } ] ], + '2.bam' => + [ [ 1, 1000, 10, -10, '1', 2001, 1, 10, { TT => 10 } ] ], + }, + samples => $samples, + } + ); +} +qr/Regions not in the same order or not the same in each list/ms, + q{Different 3' end position}; +throws_ok { + merge_read_counts( + { + seq_name => '1', + regions => { + '1.bam' => + [ [ 1, 1000, 10, -10, '1', 2000, 1, 10, { AA => 10 } ] ], + '2.bam' => + [ [ 1, 1000, 10, -10, '1', 2000, -1, 10, { TT => 10 } ] ], + }, + samples => $samples, + } + ); +} +qr/Regions not in the same order or not the same in each list/ms, + q{Different 3' end strand}; +throws_ok { + merge_read_counts( + { + seq_name => '1', + regions => { + '1.bam' => + [ [ 1, 1000, 10, -10, '1', 2000, 1, 10, { AA => 10 } ] ], + '2.bam' => + [ [ 1, 1000, 10, -10, '1', 2000, 1, 11, { TT => 10 } ] ], + }, + samples => $samples, + } + ); +} +qr/Regions not in the same order or not the same in each list/ms, + q{Different 3' end read count}; +throws_ok { + merge_read_counts( + { + seq_name => '1', + regions => { + '1.bam' => + [ [ 1, 1000, 10, -10, undef, 2000, 1, 10, { AA => 10 } ] ], + '2.bam' => + [ [ 1, 1000, 10, -10, '1', 2000, 1, 10, { TT => 10 } ] ], + }, + samples => $samples, + } + ); +} +qr/Regions not in the same order or not the same in each list/ms, + q{3' end sequence undefined}; +throws_ok { + merge_read_counts( + { + seq_name => '1', + regions => { + '1.bam' => + [ [ 1, 1000, 10, -10, '1', 2000, 1, 10, { AA => 10 } ] ], + '2.bam' => + [ [ 1, 1000, 10, -10, undef, 2000, 1, 10, { TT => 10 } ] ], + }, + samples => $samples, + } + ); +} +qr/Regions not in the same order or not the same in each list/ms, + q{Other 3' end sequence undefined}; +throws_ok { + merge_read_counts( + { + seq_name => '1', + regions => { + '1.bam' => + [ [ 1, 1000, 10, -10, '1', undef, 1, 10, { AA => 10 } ] ], + '2.bam' => + [ [ 1, 1000, 10, -10, '1', 2000, 1, 10, { TT => 10 } ] ], + }, + samples => $samples, + } + ); +} +qr/Regions not in the same order or not the same in each list/ms, + q{3' end position undefined}; +throws_ok { + merge_read_counts( + { + seq_name => '1', + regions => { + '1.bam' => + [ [ 1, 1000, 10, -10, '1', 2000, 1, 10, { AA => 10 } ] ], + '2.bam' => + [ [ 1, 1000, 10, -10, '1', undef, 1, 10, { TT => 10 } ] ], + }, + samples => $samples, + } + ); +} +qr/Regions not in the same order or not the same in each list/ms, + q{Other 3' end position undefined}; +throws_ok { + merge_read_counts( + { + seq_name => '1', + regions => { + '1.bam' => + [ [ 1, 1000, 10, -10, '1', 2000, undef, 10, { AA => 10 } ] ], + '2.bam' => + [ [ 1, 1000, 10, -10, '1', 2000, 1, 10, { TT => 10 } ] ], + }, + samples => $samples, + } + ); +} +qr/Regions not in the same order or not the same in each list/ms, + q{3' end strand undefined}; +throws_ok { + merge_read_counts( + { + seq_name => '1', + regions => { + '1.bam' => + [ [ 1, 1000, 10, -10, '1', 2000, 1, 10, { AA => 10 } ] ], + '2.bam' => + [ [ 1, 1000, 10, -10, '1', 2000, undef, 10, { TT => 10 } ] ], + }, + samples => $samples, + } + 
); +} +qr/Regions not in the same order or not the same in each list/ms, + q{Other 3' end strand undefined}; +throws_ok { + merge_read_counts( + { + seq_name => '1', + regions => { + '1.bam' => + [ [ 1, 1000, 10, -10, '1', 2000, 1, undef, { AA => 10 } ] ], + '2.bam' => + [ [ 1, 1000, 10, -10, '1', 2000, 1, 10, { TT => 10 } ] ], + }, + samples => $samples, + } + ); +} +qr/Regions not in the same order or not the same in each list/ms, + q{3' end read count undefined}; +throws_ok { + merge_read_counts( + { + seq_name => '1', + regions => { + '1.bam' => + [ [ 1, 1000, 10, -10, '1', 2000, 1, 10, { AA => 10 } ] ], + '2.bam' => + [ [ 1, 1000, 10, -10, '1', 2000, 1, undef, { TT => 10 } ] ], + }, + samples => $samples, + } + ); +} +qr/Regions not in the same order or not the same in each list/ms, + q{Other 3' end read count undefined}; + +# Test unknown BAM file and/or tag +throws_ok { + merge_read_counts( + { + seq_name => '1', + regions => { + '1.bam' => + [ [ 1, 1000, 10, -10, '1', 2000, 1, 10, { AA => 10 } ] ], + '3.bam' => + [ [ 1, 1000, 10, -10, '1', 2000, 1, 10, { TT => 10 } ] ], + }, + samples => $samples, + } + ); +} +qr/Unknown BAM file/ms, q{BAM file not in samples}; +throws_ok { + merge_read_counts( + { + seq_name => '1', + regions => { + '1.bam' => + [ [ 1, 1000, 10, -10, '1', 2000, 1, 10, { AA => 10 } ] ], + '2.bam' => + [ [ 1, 1000, 10, -10, '1', 2000, 1, 10, { CC => 10 } ] ], + }, + samples => $samples, + } + ); +} +qr/Unknown BAM file/ms, q{Tag not in samples}; +throws_ok { + merge_read_counts( + { + seq_name => '1', + regions => { + '1.bam' => + [ [ 1, 1000, 10, -10, '1', 2000, 1, 10, { TT => 10 } ] ], + '2.bam' => + [ [ 1, 1000, 10, -10, '1', 2000, 1, 10, { AA => 10 } ] ], + }, + samples => $samples, + } + ); +} +qr/Unknown BAM file/ms, q{Combination of BAM file and tag not in samples}; + +my $read_counts; + +$read_counts = merge_read_counts( + { + seq_name => '1', + regions => { + '1.bam' => [ [ 1, 1000, 10, -10, '1', 2000, 1, 10, { AA => 10 } ] ], + '2.bam' => [ [ 1, 1000, 10, -10, '1', 2000, 1, 10, { TT => 20 } ] ], + }, + samples => $samples, + } +); +is( scalar keys %{$read_counts}, 1, '1 sequence' ); +is( scalar @{ $read_counts->{'1'} }, 1, '1 region' ); +is( $read_counts->{'1'}->[0]->[0], 1, 'Region start' ); +is( $read_counts->{'1'}->[0]->[1], 1000, 'Region end' ); +is( $read_counts->{'1'}->[0]->[2], 10, 'Region maximum read count' ); +is( $read_counts->{'1'}->[0]->[3], -10, 'Region log probability sum' ); +is( $read_counts->{'1'}->[0]->[4], '1', q{3' end sequence} ); +is( $read_counts->{'1'}->[0]->[5], 2000, q{3' end position} ); +is( $read_counts->{'1'}->[0]->[6], 1, q{3' end strand} ); +is( $read_counts->{'1'}->[0]->[7], 10, q{3' end read count} ); +is( scalar @{ $read_counts->{'1'}->[0]->[8] }, 2, '2 samples' ); +is( $read_counts->{'1'}->[0]->[8]->[0], 10, '10 reads' ); +is( $read_counts->{'1'}->[0]->[8]->[1], 20, '20 reads' ); + +$read_counts = merge_read_counts( + { + seq_name => '1', + regions => { + '1.bam' => [ + [ 1, 1000, 10, -10, undef, undef, undef, undef, { AA => 10 } ] + ], + '2.bam' => [ + [ 1, 1000, 10, -10, undef, undef, undef, undef, { TT => 20 } ] + ], + }, + samples => $samples, + } +); +is( scalar keys %{$read_counts}, 1, '1 sequence' ); +is( scalar @{ $read_counts->{'1'} }, 1, '1 region' ); +is( $read_counts->{'1'}->[0]->[0], 1, 'Region start' ); +is( $read_counts->{'1'}->[0]->[1], 1000, 'Region end' ); +is( $read_counts->{'1'}->[0]->[2], 10, 'Region maximum read count' ); +is( $read_counts->{'1'}->[0]->[3], -10, 'Region log probability sum' ); +is( 
$read_counts->{'1'}->[0]->[4], undef, q{3' end sequence} ); +is( $read_counts->{'1'}->[0]->[5], undef, q{3' end position} ); +is( $read_counts->{'1'}->[0]->[6], undef, q{3' end strand} ); +is( $read_counts->{'1'}->[0]->[7], undef, q{3' end read count} ); +is( scalar @{ $read_counts->{'1'}->[0]->[8] }, 2, '2 samples' ); +is( $read_counts->{'1'}->[0]->[8]->[0], 10, '10 reads' ); +is( $read_counts->{'1'}->[0]->[8]->[1], 20, '20 reads' ); diff --git a/t/misc-output.t b/t/misc-output.t new file mode 100644 index 0000000..555bfd8 --- /dev/null +++ b/t/misc-output.t @@ -0,0 +1,98 @@ +use Test::More; +use Test::Exception; +use Test::Warn; +use Test::DatabaseRow; +use Test::MockObject; +use Carp; + +plan tests => 13; + +use DETCT::Misc::Output qw( + dump_as_table +); + +use File::Temp qw( tempdir ); +use File::Spec; + +my $tmp_dir = tempdir( CLEANUP => 1 ); + +# Mock sample objects +my $sample1 = Test::MockObject->new(); +$sample1->set_isa('DETCT::Sample'); +$sample1->set_always( 'name', 'wt1' ); +$sample1->set_always( 'condition', 'sibling' ); +$sample1->set_always( 'group', '1' ); +my $sample2 = Test::MockObject->new(); +$sample2->set_isa('DETCT::Sample'); +$sample2->set_always( 'name', 'wt2' ); +$sample2->set_always( 'condition', 'sibling' ); +$sample2->set_always( 'group', '2' ); +my $sample3 = Test::MockObject->new(); +$sample3->set_isa('DETCT::Sample'); +$sample3->set_always( 'name', 'mut1' ); +$sample3->set_always( 'condition', 'mutant' ); +$sample3->set_always( 'group', '1' ); +my $sample4 = Test::MockObject->new(); +$sample4->set_isa('DETCT::Sample'); +$sample4->set_always( 'name', 'mut2' ); +$sample4->set_always( 'condition', 'mutant' ); +$sample4->set_always( 'group', '2' ); +my $samples = [ $sample1, $sample2, $sample3, $sample4 ]; + +# Mock analysis object +my $analysis = Test::MockObject->new(); +$analysis->set_isa('DETCT::Analysis'); +$analysis->set_always( 'get_all_samples', $samples ); +$analysis->set_always( 'ensembl_species', 'danio_rerio' ); + +my $regions = [ + [ + '1', 1, 110, 10, -10, '1', 110, + 1, 10, + [ 4, 1, 2, 7 ], + [ 4.6, 1.1, 2.1, 4.6 ], + undef, undef, + [ 1.18, 0.233 ], + [ [ 0.46, -1.13 ], [ 4.18, 2.06 ] ], + { + e61 => [ + [ + 'ENSDARG00000095747', + 'cxc64', + 'CXC chemokine 64', + 'protein_coding', + 5, + [ [ 'ENSDART00000133571', 'protein_coding' ] ] + ] + ] + } + ], + [ + '1', 1, + 1000, 10, + -10, undef, + undef, undef, + undef, [ 4, 1, 2, 7 ], + [ 4.6, 1.1, 2.1, 4.6 ], undef, + undef, [ 1.18, 0.233 ], + [ [ 0.46, -1.13 ], [ 4.18, 2.06 ] ], {} + ], +]; + +is( + dump_as_table( + { analysis => $analysis, dir => $tmp_dir, regions => $regions, } + ), + undef, 'Dump' +); + +foreach my $format ( 'csv', 'tsv', 'html' ) { + foreach my $level ( 'all', 'sig' ) { + my $file = $level . q{.} . $format; + my $filepath = File::Spec->catfile( $tmp_dir, $file ); + ok( -e $filepath, $file . ' exists' ); + ok( !-z $filepath, $file . 
' is not empty' ); + } +} + +# TODO: Actually test output diff --git a/t/misc-peakhmm.t b/t/misc-peakhmm.t new file mode 100644 index 0000000..3764343 --- /dev/null +++ b/t/misc-peakhmm.t @@ -0,0 +1,572 @@ +use Test::More; +use Test::Exception; +use Test::Warn; +use Test::DatabaseRow; +use Test::MockObject; +use Carp; + +plan tests => 103; + +use DETCT::Misc::PeakHMM qw( + merge_read_peaks + summarise_read_peaks + run_peak_hmm + join_hmm_bins +); + +use File::Temp qw( tempdir ); +use File::Path qw( make_path ); +use POSIX qw( WIFEXITED); + +# Compile quince_chiphmmnew if necessary +if ( !-r 'bin/quince_chiphmmnew' ) { + make_path('bin'); + my $cmd = 'g++ -o bin/quince_chiphmmnew src/quince_chiphmmnew.cpp'; + WIFEXITED( system $cmd) or confess "Couldn't run $cmd"; +} + +my $input_peaks; +my $output_peaks; + +# Check merging read peaks required parameters +throws_ok { + merge_read_peaks( + { + seq_name => 1, + peaks => [ [ 1, 2, 1 ] ], + } + ); +} +qr/No peak buffer width specified/ms, 'No peak buffer width'; +throws_ok { + merge_read_peaks( + { + peak_buffer_width => 100, + peaks => [ [ 1, 2, 1 ] ], + } + ); +} +qr/No sequence name specified/ms, 'No sequence name'; +throws_ok { + merge_read_peaks( + { + peak_buffer_width => 100, + seq_name => 1, + } + ); +} +qr/No peaks specified/ms, 'No peaks'; + +# Two peaks but no merging +$input_peaks = [ [ 100, 200, 1 ], [ 500, 600, 1 ], ]; +$output_peaks = merge_read_peaks( + { + peak_buffer_width => 100, + seq_name => 1, + peaks => $input_peaks, + } +); +is( scalar keys %{$output_peaks}, 1, '1 sequence' ); +is( scalar @{ $output_peaks->{'1'} }, 2, '2 peaks' ); +is( $output_peaks->{'1'}->[0]->[0], 100, 'Start of first peak' ); +is( $output_peaks->{'1'}->[0]->[1], 200, 'End of first peak' ); +is( $output_peaks->{'1'}->[0]->[2], 1, 'First peak read count' ); +is( $output_peaks->{'1'}->[-1]->[0], 500, 'Start of last peak' ); +is( $output_peaks->{'1'}->[-1]->[1], 600, 'End of last peak' ); +is( $output_peaks->{'1'}->[-1]->[2], 1, 'Last peak read count' ); + +# Two peaks merged into one +$input_peaks = [ [ 100, 200, 1 ], [ 250, 350, 1 ], ]; +$output_peaks = merge_read_peaks( + { + peak_buffer_width => 100, + seq_name => 1, + peaks => $input_peaks, + } +); +is( scalar keys %{$output_peaks}, 1, '1 sequence' ); +is( scalar @{ $output_peaks->{'1'} }, 1, '1 peak' ); +is( $output_peaks->{'1'}->[0]->[0], 100, 'Start of first peak' ); +is( $output_peaks->{'1'}->[0]->[1], 350, 'End of first peak' ); +is( $output_peaks->{'1'}->[0]->[2], 2, 'First peak read count' ); + +# Three peaks with first two merged +$input_peaks = [ [ 100, 200, 1 ], [ 250, 350, 1 ], [ 500, 600, 1 ], ]; +$output_peaks = merge_read_peaks( + { + peak_buffer_width => 100, + seq_name => 1, + peaks => $input_peaks, + } +); +is( scalar keys %{$output_peaks}, 1, '1 sequence' ); +is( scalar @{ $output_peaks->{'1'} }, 2, '2 peaks' ); +is( $output_peaks->{'1'}->[0]->[0], 100, 'Start of first peak' ); +is( $output_peaks->{'1'}->[0]->[1], 350, 'End of first peak' ); +is( $output_peaks->{'1'}->[0]->[2], 2, 'First peak read count' ); +is( $output_peaks->{'1'}->[-1]->[0], 500, 'Start of last peak' ); +is( $output_peaks->{'1'}->[-1]->[1], 600, 'End of last peak' ); +is( $output_peaks->{'1'}->[-1]->[2], 1, 'Last peak read count' ); + +# Three peaks with second two merged +$input_peaks = [ [ 100, 200, 1 ], [ 500, 600, 1 ], [ 550, 650, 1 ], ]; +$output_peaks = merge_read_peaks( + { + peak_buffer_width => 100, + seq_name => 1, + peaks => $input_peaks, + } +); +is( scalar keys %{$output_peaks}, 1, '1 sequence' ); 
+is( scalar @{ $output_peaks->{'1'} }, 2, '2 peaks' ); +is( $output_peaks->{'1'}->[0]->[0], 100, 'Start of first peak' ); +is( $output_peaks->{'1'}->[0]->[1], 200, 'End of first peak' ); +is( $output_peaks->{'1'}->[0]->[2], 1, 'First peak read count' ); +is( $output_peaks->{'1'}->[-1]->[0], 500, 'Start of last peak' ); +is( $output_peaks->{'1'}->[-1]->[1], 650, 'End of last peak' ); +is( $output_peaks->{'1'}->[-1]->[2], 2, 'Last peak read count' ); + +# Two peaks separated by buffer width +$input_peaks = [ [ 100, 200, 1 ], [ 300, 400, 1 ], ]; +$output_peaks = merge_read_peaks( + { + peak_buffer_width => 100, + seq_name => 1, + peaks => $input_peaks, + } +); +is( scalar keys %{$output_peaks}, 1, '1 sequence' ); +is( scalar @{ $output_peaks->{'1'} }, 2, '2 peaks' ); +is( $output_peaks->{'1'}->[0]->[0], 100, 'Start of first peak' ); +is( $output_peaks->{'1'}->[0]->[1], 200, 'End of first peak' ); +is( $output_peaks->{'1'}->[0]->[2], 1, 'First peak read count' ); +is( $output_peaks->{'1'}->[-1]->[0], 300, 'Start of last peak' ); +is( $output_peaks->{'1'}->[-1]->[1], 400, 'End of last peak' ); +is( $output_peaks->{'1'}->[-1]->[2], 1, 'Last peak read count' ); + +# Two peaks separated by just under buffer width +$input_peaks = [ [ 100, 200, 1 ], [ 299, 400, 1 ], ]; +$output_peaks = merge_read_peaks( + { + peak_buffer_width => 100, + seq_name => 1, + peaks => $input_peaks, + } +); +is( scalar keys %{$output_peaks}, 1, '1 sequence' ); +is( scalar @{ $output_peaks->{'1'} }, 1, '1 peak' ); +is( $output_peaks->{'1'}->[0]->[0], 100, 'Start of first peak' ); +is( $output_peaks->{'1'}->[0]->[1], 400, 'End of first peak' ); +is( $output_peaks->{'1'}->[0]->[2], 2, 'First peak read count' ); + +# Two peaks with same start +$input_peaks = [ [ 100, 200, 1 ], [ 100, 300, 1 ], ]; +$output_peaks = merge_read_peaks( + { + peak_buffer_width => 100, + seq_name => 1, + peaks => $input_peaks, + } +); +is( scalar keys %{$output_peaks}, 1, '1 sequence' ); +is( scalar @{ $output_peaks->{'1'} }, 1, '1 peak' ); +is( $output_peaks->{'1'}->[0]->[0], 100, 'Start of first peak' ); +is( $output_peaks->{'1'}->[0]->[1], 300, 'End of first peak' ); +is( $output_peaks->{'1'}->[0]->[2], 2, 'First peak read count' ); + +# No peaks +$input_peaks = []; +$output_peaks = merge_read_peaks( + { + peak_buffer_width => 100, + seq_name => 1, + peaks => $input_peaks, + } +); +is( scalar keys %{$output_peaks}, 1, '1 sequence' ); +is( scalar @{ $output_peaks->{'1'} }, 0, '0 peaks' ); + +my $summary; + +# Check summarising read peaks required parameters +throws_ok { + summarise_read_peaks( + { + peak_buffer_width => 100, + hmm_sig_level => 0.001, + seq_name => '1', + seq_bp => 1000, + read_length => 54, + peaks => [ [ 1, 2, 1 ] ], + } + ); +} +qr/No bin size specified/ms, 'No bin size'; +throws_ok { + summarise_read_peaks( + { + bin_size => 100, + hmm_sig_level => 0.001, + seq_name => '1', + seq_bp => 1000, + read_length => 54, + peaks => [ [ 1, 2, 1 ] ], + } + ); +} +qr/No peak buffer width specified/ms, 'No peak buffer width'; +throws_ok { + summarise_read_peaks( + { + bin_size => 100, + peak_buffer_width => 100, + seq_name => '1', + seq_bp => 1000, + read_length => 54, + peaks => [ [ 1, 2, 1 ] ], + } + ); +} +qr/No HMM significance level specified/ms, 'No HMM significance level'; +throws_ok { + summarise_read_peaks( + { + bin_size => 100, + peak_buffer_width => 100, + hmm_sig_level => 0.001, + seq_bp => 1000, + read_length => 54, + peaks => [ [ 1, 2, 1 ] ], + } + ); +} +qr/No sequence name specified/ms, 'No sequence name'; +throws_ok { + 
summarise_read_peaks( + { + bin_size => 100, + peak_buffer_width => 100, + hmm_sig_level => 0.001, + seq_name => '1', + read_length => 54, + peaks => [ [ 1, 2, 1 ] ], + } + ); +} +qr/No sequence bp specified/ms, 'No sequence bp'; +throws_ok { + summarise_read_peaks( + { + bin_size => 100, + peak_buffer_width => 100, + hmm_sig_level => 0.001, + seq_name => '1', + seq_bp => 1000, + peaks => [ [ 1, 2, 1 ] ], + } + ); +} +qr/No read length specified/ms, 'No read length'; +throws_ok { + summarise_read_peaks( + { + bin_size => 100, + peak_buffer_width => 100, + hmm_sig_level => 0.001, + seq_name => '1', + seq_bp => 1000, + read_length => 54, + } + ); +} +qr/No peaks specified/ms, 'No peaks'; + +# Two peaks, one significant +$input_peaks = [ [ 100, 199, 5 ], [ 300, 399, 1 ], ]; +$summary = summarise_read_peaks( + { + bin_size => 100, + peak_buffer_width => 100, + hmm_sig_level => 0.001, + seq_name => '1', + seq_bp => 1000, + read_length => 54, + peaks => $input_peaks, + } +); +is( scalar keys %{$summary}, 1, '1 sequence' ); +is( scalar keys %{ $summary->{'1'} }, 9, '9 keys' ); +is( + $summary->{'1'}->{total_read_count_per_mb}, + 6 / 1_000_000, + 'Total read count per Mb' +); +is( + $summary->{'1'}->{total_sig_read_count_per_mb}, + 5 / 1_000_000, + 'Total significant read count per Mb' +); +is( + $summary->{'1'}->{total_sig_peak_width_in_mb}, + 100 / 1_000_000, + 'Total significant peak width in Mb' +); +is( $summary->{'1'}->{median_sig_peak_width}, + 100, 'Median significant peak width' ); +is( $summary->{'1'}->{total_sig_peaks}, 1, 'Total significant peaks' ); +is( $summary->{'1'}->{peak_buffer_width}, 100, 'Peak buffer width' ); +ok( $summary->{'1'}->{read_threshold} < 5, 'Read threshold' ); +is( $summary->{'1'}->{bin_size}, 100, 'Bin size' ); +is( $summary->{'1'}->{num_bins}, 10, 'Number of bins' ); + +# No significant peaks +$input_peaks = [ [ 300, 399, 1 ], ]; +$summary = summarise_read_peaks( + { + bin_size => 100, + peak_buffer_width => 100, + hmm_sig_level => 0.001, + seq_name => '1', + seq_bp => 1000, + read_length => 54, + peaks => $input_peaks, + } +); +is( $summary->{'1'}->{median_sig_peak_width}, + 0, 'Median significant peak width' ); + +# Three significant peaks +$input_peaks = [ [ 100, 149, 500 ], [ 300, 399, 500 ], [ 600, 759, 500 ], ]; +$summary = summarise_read_peaks( + { + bin_size => 100, + peak_buffer_width => 100, + hmm_sig_level => 0.001, + seq_name => '1', + seq_bp => 1000, + read_length => 54, + peaks => $input_peaks, + } +); +is( $summary->{'1'}->{median_sig_peak_width}, + 100, 'Median significant peak width' ); + +# No peaks +$summary = summarise_read_peaks( + { + bin_size => 100, + peak_buffer_width => 100, + hmm_sig_level => 0.001, + seq_name => '1', + seq_bp => 1000, + read_length => 54, + peaks => [], + } +); +is( scalar keys %{ $summary->{'1'} }, 0, 'No summary' ); + +my $tmp_dir = tempdir( CLEANUP => 1 ); + +# Check running peak HMM required parameters +my $read_bins = { + 1 => 500, + 3 => 1, +}; +$summary = { + total_read_count_per_mb => 501 / 1_000_000, + total_sig_read_count_per_mb => 500 / 1_000_000, + total_sig_peak_width_in_mb => 100 / 1_000_000, + median_sig_peak_width => 100, + total_sig_peaks => 1, + peak_buffer_width => 100, + read_threshold => 3, + bin_size => 100, + num_bins => 10, +}; +throws_ok { + run_peak_hmm( + { + hmm_sig_level => 0.001, + seq_name => '1', + read_bins => $read_bins, + summary => $summary, + hmm_binary => 'bin/quince_chiphmmnew', + } + ); +} +qr/No directory specified/ms, 'No directory'; +throws_ok { + run_peak_hmm( + { + dir => 
$tmp_dir, + seq_name => '1', + read_bins => $read_bins, + summary => $summary, + hmm_binary => 'bin/quince_chiphmmnew', + } + ); +} +qr/No HMM significance level specified/ms, 'No HMM significance level'; +throws_ok { + run_peak_hmm( + { + dir => $tmp_dir, + hmm_sig_level => 0.001, + read_bins => $read_bins, + summary => $summary, + hmm_binary => 'bin/quince_chiphmmnew', + } + ); +} +qr/No sequence name specified/ms, 'No sequence name'; +throws_ok { + run_peak_hmm( + { + dir => $tmp_dir, + hmm_sig_level => 0.001, + seq_name => '1', + summary => $summary, + hmm_binary => 'bin/quince_chiphmmnew', + } + ); +} +qr/No read bins specified/ms, 'No read bins'; +throws_ok { + run_peak_hmm( + { + dir => $tmp_dir, + hmm_sig_level => 0.001, + seq_name => '1', + read_bins => $read_bins, + hmm_binary => 'bin/quince_chiphmmnew', + } + ); +} +qr/No summary specified/ms, 'No summary'; +throws_ok { + run_peak_hmm( + { + dir => $tmp_dir, + hmm_sig_level => 0.001, + seq_name => '1', + read_bins => $read_bins, + summary => $summary, + } + ); +} +qr/No HMM binary specified/ms, 'No HMM binary'; + +my $hmm; + +# Run peak HMM +$hmm = run_peak_hmm( + { + dir => $tmp_dir, + hmm_sig_level => 0.001, + seq_name => '1', + read_bins => $read_bins, + summary => $summary, + hmm_binary => 'bin/quince_chiphmmnew', + } +); +is( scalar keys %{$hmm}, 1, '1 sequence' ); +is( scalar @{ $hmm->{'1'} }, 1, '1 peak' ); +is( $hmm->{'1'}->[0]->[0], 1, 'Bin 1' ); +is( $hmm->{'1'}->[0]->[1], 500, '500 reads' ); +ok( $hmm->{'1'}->[0]->[2] < 0, 'Log probability negative' ); + +# Run peak HMM with no summary +$hmm = run_peak_hmm( + { + dir => $tmp_dir, + hmm_sig_level => 0.001, + seq_name => '1', + read_bins => $read_bins, + summary => {}, + hmm_binary => 'bin/quince_chiphmmnew', + } +); +is( scalar keys %{$hmm}, 1, '1 sequence' ); +is( scalar @{ $hmm->{'1'} }, 0, '0 peaks' ); + +# Run peak HMM with non-existent working directory +$hmm = run_peak_hmm( + { + dir => $tmp_dir . 
'/test',
+        hmm_sig_level => 0.001,
+        seq_name      => '1',
+        read_bins     => $read_bins,
+        summary       => $summary,
+        hmm_binary    => 'bin/quince_chiphmmnew',
+    }
+);
+is( scalar keys %{$hmm}, 1, '1 sequence' );
+is( scalar @{ $hmm->{'1'} }, 1, '1 peak' );
+
+# Check joining HMM bins required parameters
+my $hmm_bins = [
+    [ 1, 10, -2.3 ],
+    [ 2, 20, -2.3 ],
+    [ 4, 10, -2.3 ],
+    [ 5, 30, -2.3 ],
+    [ 6, 20, -2.3 ],
+];
+throws_ok {
+    join_hmm_bins(
+        {
+            seq_name => '1',
+            hmm_bins => $hmm_bins,
+        }
+    );
+}
+qr/No bin size specified/ms, 'No bin size';
+throws_ok {
+    join_hmm_bins(
+        {
+            bin_size => 100,
+            hmm_bins => $hmm_bins,
+        }
+    );
+}
+qr/No sequence name specified/ms, 'No sequence name';
+throws_ok {
+    join_hmm_bins(
+        {
+            bin_size => 100,
+            seq_name => '1',
+        }
+    );
+}
+qr/No HMM bins specified/ms, 'No HMM bins';
+
+my $regions;
+
+# Five bins joined into two regions
+$regions = join_hmm_bins(
+    {
+        bin_size => 100,
+        seq_name => '1',
+        hmm_bins => $hmm_bins,
+    }
+);
+is( scalar keys %{$regions}, 1, '1 sequence' );
+is( scalar @{ $regions->{'1'} }, 2, '2 regions' );
+is( $regions->{'1'}->[0]->[0], 101, 'Region 1 start' );
+is( $regions->{'1'}->[0]->[1], 300, 'Region 1 end' );
+is( $regions->{'1'}->[0]->[2], 20, 'Region 1 max read count' );
+is( $regions->{'1'}->[0]->[3], -4.6, 'Region 1 log probability sum' );
+is( $regions->{'1'}->[1]->[0], 401, 'Region 2 start' );
+is( $regions->{'1'}->[1]->[1], 700, 'Region 2 end' );
+is( $regions->{'1'}->[1]->[2], 30, 'Region 2 max read count' );
+is( $regions->{'1'}->[1]->[3], -6.9, 'Region 2 log probability sum' );
+
+# No bins
+$regions = join_hmm_bins(
+    {
+        bin_size => 100,
+        seq_name => '1',
+        hmm_bins => [],
+    }
+);
+is( scalar keys %{$regions}, 1, '1 sequence' );
+is( scalar @{ $regions->{'1'} }, 0, '0 regions' );
diff --git a/t/misc-tag.t b/t/misc-tag.t
new file mode 100644
index 0000000..2cf6a8a
--- /dev/null
+++ b/t/misc-tag.t
@@ -0,0 +1,260 @@
+use Test::More;
+use Test::Exception;
+use Test::Warn;
+use Test::DatabaseRow;
+use Test::MockObject;
+use Carp;
+
+plan tests => 104;
+
+use DETCT::Misc::Tag qw(
+    detag_trim_fastq
+    convert_tag_to_regexp
+);
+
+use File::Temp qw( tempdir );
+use File::Slurp;
+
+=for comment
+
+Test random FASTQ files can be regenerated using:
+
+perl script/make_test_fastq.pl --seed 1 --output_prefix test1 \
+--read_tags NNNNBGAGGC NNNNBAGAAG
+perl script/make_test_fastq.pl --seed 2 --output_prefix test2 \
+--read_tags NNNNBGAGGC NNNNBAGAAG --read_length 54
+mv test* t/data/
+
+Some numbers in tests below will then need updating.
+
+test1 NNNNBGAGGC: 25
+test1 NNNNBAGAAG: 24
+test1 XXXXXXXXXX: 51
+
+test2 NNNNBGAGGC: 24
+test2 NNNNBAGAAG: 35
+test2 XXXXXXXXXX: 41
+
+=cut
+
+my $tmp_dir = tempdir( CLEANUP => 1 );
+
+# Check detagging and trimming FASTQ files
+is(
+    detag_trim_fastq(
+        {
+            fastq_read1_input     => 't/data/test1_1.fastq',
+            fastq_read2_input     => 't/data/test1_2.fastq',
+            fastq_output_prefix   => $tmp_dir . '/test1',
+            pre_detag_trim_length => 54,
+            polyt_trim_length     => 14,
+            polyt_min_length      => 10,
+            read_tags             => [ 'NNNNBGAGGC', 'NNNNBAGAAG' ],
+        }
+    ),
+    undef,
+    'Detag and trim FASTQ'
+);
+
+my @fastq;
+@fastq = read_file( $tmp_dir . '/test1_NNNNBGAGGC_1.fastq' );
+is( scalar @fastq / 4, 25, '25 read 1s' );
+@fastq = read_file( $tmp_dir . '/test1_NNNNBGAGGC_2.fastq' );
+is( scalar @fastq / 4, 25, '25 read 2s' );
+@fastq = read_file( $tmp_dir . '/test1_NNNNBAGAAG_1.fastq' );
+is( scalar @fastq / 4, 24, '24 read 1s' );
+@fastq = read_file( $tmp_dir .
'/test1_NNNNBAGAAG_2.fastq' );
+is( scalar @fastq / 4, 24, '24 read 2s' );
+@fastq = read_file( $tmp_dir . '/test1_XXXXXXXXXX_1.fastq' );
+is( scalar @fastq / 4, 51, '51 read 1s' );
+@fastq = read_file( $tmp_dir . '/test1_XXXXXXXXXX_2.fastq' );
+is( scalar @fastq / 4, 51, '51 read 2s' );
+
+@fastq = read_file( $tmp_dir . '/test1_NNNNBGAGGC_1.fastq' );
+my $read_name = $fastq[0];
+chomp $read_name;
+is( substr( $read_name, -7 ), 'GAGGC/1', 'Tag added to read name' );
+my $read_seq = $fastq[1];
+chomp $read_seq;
+is( length $read_seq, 30, 'Sequence trimmed to 30 bp' );
+my $read_qual = $fastq[3];
+chomp $read_qual;
+is( length $read_qual, 30, 'Quality trimmed to 30 bp' );
+
+@fastq = read_file( $tmp_dir . '/test1_NNNNBGAGGC_2.fastq' );
+my $read_name = $fastq[0];
+chomp $read_name;
+is( substr( $read_name, -7 ), 'GAGGC/2', 'Tag added to read name' );
+my $read_seq = $fastq[1];
+chomp $read_seq;
+is( length $read_seq, 54, 'Sequence trimmed to 54 bp' );
+my $read_qual = $fastq[3];
+chomp $read_qual;
+is( length $read_qual, 54, 'Quality trimmed to 54 bp' );
+
+@fastq = read_file( $tmp_dir . '/test1_XXXXXXXXXX_1.fastq' );
+my $read_name = $fastq[0];
+chomp $read_name;
+is( substr( $read_name, -13 ), '#XXXXXXXXXX/1', 'Tag added to read name' );
+my $read_seq = $fastq[1];
+chomp $read_seq;
+is( length $read_seq, 54, 'Sequence trimmed to 54 bp' );
+my $read_qual = $fastq[3];
+chomp $read_qual;
+is( length $read_qual, 54, 'Quality trimmed to 54 bp' );
+
+@fastq = read_file( $tmp_dir . '/test1_XXXXXXXXXX_2.fastq' );
+my $read_name = $fastq[0];
+chomp $read_name;
+is( substr( $read_name, -13 ), '#XXXXXXXXXX/2', 'Tag added to read name' );
+my $read_seq = $fastq[1];
+chomp $read_seq;
+is( length $read_seq, 54, 'Sequence trimmed to 54 bp' );
+my $read_qual = $fastq[3];
+chomp $read_qual;
+is( length $read_qual, 54, 'Quality trimmed to 54 bp' );
+
+# Check detagging and trimming FASTQ files
+is(
+    detag_trim_fastq(
+        {
+            fastq_read1_input     => 't/data/test2_1.fastq',
+            fastq_read2_input     => 't/data/test2_2.fastq',
+            fastq_output_prefix   => $tmp_dir . '/test2',
+            pre_detag_trim_length => 54,
+            polyt_trim_length     => 14,
+            polyt_min_length      => 10,
+            read_tags             => [ 'NNNNBGAGGC', 'NNNNBAGAAG' ],
+        }
+    ),
+    undef,
+    'Detag and trim FASTQ'
+);
+
+my @fastq;
+@fastq = read_file( $tmp_dir . '/test2_NNNNBGAGGC_1.fastq' );
+is( scalar @fastq / 4, 24, '24 read 1s' );
+@fastq = read_file( $tmp_dir . '/test2_NNNNBGAGGC_2.fastq' );
+is( scalar @fastq / 4, 24, '24 read 2s' );
+@fastq = read_file( $tmp_dir . '/test2_NNNNBAGAAG_1.fastq' );
+is( scalar @fastq / 4, 35, '35 read 1s' );
+@fastq = read_file( $tmp_dir . '/test2_NNNNBAGAAG_2.fastq' );
+is( scalar @fastq / 4, 35, '35 read 2s' );
+@fastq = read_file( $tmp_dir . '/test2_XXXXXXXXXX_1.fastq' );
+is( scalar @fastq / 4, 41, '41 read 1s' );
+@fastq = read_file( $tmp_dir . '/test2_XXXXXXXXXX_2.fastq' );
+is( scalar @fastq / 4, 41, '41 read 2s' );
+
+throws_ok {
+    detag_trim_fastq(
+        {
+            fastq_read1_input     => 't/data/test1_1.fastq',
+            fastq_read2_input     => 't/data/test2_2.fastq',
+            fastq_output_prefix   => $tmp_dir .
'/test',
+            pre_detag_trim_length => 54,
+            polyt_trim_length     => 14,
+            polyt_min_length      => 10,
+            read_tags             => [ 'NNNNBGAGGC', 'NNNNBAGAAG' ],
+        }
+    );
+}
+qr/Read order does not match in input/ms, 'FASTQ files not matched';
+
+# Check converting tags to regular expressions
+
+my @tags = qw( N B D H V R Y K M S W A G C T AA );
+
+my %re_for = convert_tag_to_regexp(@tags);
+
+ok( q{A} =~ $re_for{A}->[0], 'A matches A' );
+ok( q{G} !~ $re_for{A}->[0], 'A does not match G' );
+ok( q{C} !~ $re_for{A}->[0], 'A does not match C' );
+ok( q{T} !~ $re_for{A}->[0], 'A does not match T' );
+ok( q{N} !~ $re_for{A}->[0], 'A does not match N' );
+
+ok( q{A} !~ $re_for{G}->[0], 'G does not match A' );
+ok( q{G} =~ $re_for{G}->[0], 'G matches G' );
+ok( q{C} !~ $re_for{G}->[0], 'G does not match C' );
+ok( q{T} !~ $re_for{G}->[0], 'G does not match T' );
+ok( q{N} !~ $re_for{G}->[0], 'G does not match N' );
+
+ok( q{A} !~ $re_for{C}->[0], 'C does not match A' );
+ok( q{G} !~ $re_for{C}->[0], 'C does not match G' );
+ok( q{C} =~ $re_for{C}->[0], 'C matches C' );
+ok( q{T} !~ $re_for{C}->[0], 'C does not match T' );
+ok( q{N} !~ $re_for{C}->[0], 'C does not match N' );
+
+ok( q{A} !~ $re_for{T}->[0], 'T does not match A' );
+ok( q{G} !~ $re_for{T}->[0], 'T does not match G' );
+ok( q{C} !~ $re_for{T}->[0], 'T does not match C' );
+ok( q{T} =~ $re_for{T}->[0], 'T matches T' );
+ok( q{N} !~ $re_for{T}->[0], 'T does not match N' );
+
+ok( q{A} =~ $re_for{R}->[0], 'R matches A' );
+ok( q{G} =~ $re_for{R}->[0], 'R matches G' );
+ok( q{C} !~ $re_for{R}->[0], 'R does not match C' );
+ok( q{T} !~ $re_for{R}->[0], 'R does not match T' );
+ok( q{N} !~ $re_for{R}->[0], 'R does not match N' );
+
+ok( q{A} !~ $re_for{Y}->[0], 'Y does not match A' );
+ok( q{G} !~ $re_for{Y}->[0], 'Y does not match G' );
+ok( q{C} =~ $re_for{Y}->[0], 'Y matches C' );
+ok( q{T} =~ $re_for{Y}->[0], 'Y matches T' );
+ok( q{N} !~ $re_for{Y}->[0], 'Y does not match N' );
+
+ok( q{A} !~ $re_for{S}->[0], 'S does not match A' );
+ok( q{G} =~ $re_for{S}->[0], 'S matches G' );
+ok( q{C} =~ $re_for{S}->[0], 'S matches C' );
+ok( q{T} !~ $re_for{S}->[0], 'S does not match T' );
+ok( q{N} !~ $re_for{S}->[0], 'S does not match N' );
+
+ok( q{A} =~ $re_for{W}->[0], 'W matches A' );
+ok( q{G} !~ $re_for{W}->[0], 'W does not match G' );
+ok( q{C} !~ $re_for{W}->[0], 'W does not match C' );
+ok( q{T} =~ $re_for{W}->[0], 'W matches T' );
+ok( q{N} !~ $re_for{W}->[0], 'W does not match N' );
+
+ok( q{A} !~ $re_for{K}->[0], 'K does not match A' );
+ok( q{G} =~ $re_for{K}->[0], 'K matches G' );
+ok( q{C} !~ $re_for{K}->[0], 'K does not match C' );
+ok( q{T} =~ $re_for{K}->[0], 'K matches T' );
+ok( q{N} !~ $re_for{K}->[0], 'K does not match N' );
+
+ok( q{A} =~ $re_for{M}->[0], 'M matches A' );
+ok( q{G} !~ $re_for{M}->[0], 'M does not match G' );
+ok( q{C} =~ $re_for{M}->[0], 'M matches C' );
+ok( q{T} !~ $re_for{M}->[0], 'M does not match T' );
+ok( q{N} !~ $re_for{M}->[0], 'M does not match N' );
+
+ok( q{A} !~ $re_for{B}->[0], 'B does not match A' );
+ok( q{G} =~ $re_for{B}->[0], 'B matches G' );
+ok( q{C} =~ $re_for{B}->[0], 'B matches C' );
+ok( q{T} =~ $re_for{B}->[0], 'B matches T' );
+ok( q{N} !~ $re_for{B}->[0], 'B does not match N' );
+
+ok( q{A} =~ $re_for{D}->[0], 'D matches A' );
+ok( q{G} =~ $re_for{D}->[0], 'D matches G' );
+ok( q{C} !~ $re_for{D}->[0], 'D does not match C' );
+ok( q{T} =~ $re_for{D}->[0], 'D matches T' );
+ok( q{N} !~ $re_for{D}->[0], 'D does not match N' );
+
+ok( q{A} =~ $re_for{H}->[0], 'H matches A' );
+ok( q{G} !~
$re_for{H}->[0], 'H does not match G' );
+ok( q{C} =~ $re_for{H}->[0], 'H matches C' );
+ok( q{T} =~ $re_for{H}->[0], 'H matches T' );
+ok( q{N} !~ $re_for{H}->[0], 'H does not match N' );
+
+ok( q{A} =~ $re_for{V}->[0], 'V matches A' );
+ok( q{G} =~ $re_for{V}->[0], 'V matches G' );
+ok( q{C} =~ $re_for{V}->[0], 'V matches C' );
+ok( q{T} !~ $re_for{V}->[0], 'V does not match T' );
+ok( q{N} !~ $re_for{V}->[0], 'V does not match N' );
+
+ok( q{A} =~ $re_for{N}->[0], 'N matches A' );
+ok( q{G} =~ $re_for{N}->[0], 'N matches G' );
+ok( q{C} =~ $re_for{N}->[0], 'N matches C' );
+ok( q{T} =~ $re_for{N}->[0], 'N matches T' );
+ok( q{N} =~ $re_for{N}->[0], 'N matches N' );
+
+ok( q{A} !~ $re_for{AA}->[0], 'AA does not match A' );
+ok( q{AA} =~ $re_for{AA}->[0], 'AA matches AA' );
diff --git a/t/pipeline-job.t b/t/pipeline-job.t
new file mode 100644
index 0000000..3458a56
--- /dev/null
+++ b/t/pipeline-job.t
@@ -0,0 +1,102 @@
+use Test::More;
+use Test::Exception;
+use Test::Warn;
+use Test::DatabaseRow;
+use Test::MockObject;
+use Carp;
+
+plan tests => 48;
+
+use DETCT::Pipeline::Job;
+
+# Mock stage objects with different names
+my $stage1 = Test::MockObject->new();
+$stage1->set_isa('DETCT::Pipeline::Stage');
+$stage1->set_always( 'name', 'get_read_peaks' );
+my $stage2 = Test::MockObject->new();
+$stage2->set_isa('DETCT::Pipeline::Stage');
+$stage2->set_always( 'name', 'merge_read_peaks' );
+
+my $job = DETCT::Pipeline::Job->new(
+    {
+        stage         => $stage1,
+        component     => 2,
+        scheduler     => 'local',
+        base_filename => './run_deseq/1',
+    }
+);
+
+isa_ok( $job, 'DETCT::Pipeline::Job' );
+
+# Test stage attribute
+is( $job->stage->name, 'get_read_peaks', 'Get stage' );
+is( $job->set_stage($stage2), undef, 'Set stage' );
+is( $job->stage->name, 'merge_read_peaks', 'Get new stage' );
+throws_ok { $job->set_stage() } qr/No stage specified/ms, 'No stage';
+throws_ok { $job->set_stage('invalid') } qr/Class of stage/ms, 'Invalid stage';
+
+# Test component attribute
+is( $job->component, 2, 'Get component' );
+is( $job->set_component(3), undef, 'Set component' );
+is( $job->component, 3, 'Get new component' );
+throws_ok { $job->set_component() } qr/No component specified/ms,
+  'No component';
+throws_ok { $job->set_component(-1) } qr/Invalid component/ms,
+  'Invalid component';
+
+# Test scheduler attribute
+is( $job->scheduler, 'local', 'Get scheduler' );
+is( $job->set_scheduler('lsf'), undef, 'Set scheduler' );
+is( $job->scheduler, 'lsf', 'Get new scheduler' );
+throws_ok { $job->set_scheduler() } qr/Invalid scheduler specified/ms,
+  'No scheduler';
+throws_ok { $job->set_scheduler('invalid') } qr/Invalid scheduler specified/ms,
+  'Invalid scheduler';
+
+# Test base_filename attribute
+is( $job->base_filename, './run_deseq/1', 'Get base filename' );
+is( $job->set_base_filename('./count_reads/2'), undef, 'Set base filename' );
+is( $job->base_filename, './count_reads/2', 'Get new base filename' );
+throws_ok { $job->set_base_filename() } qr/No base filename specified/ms,
+  'No base filename';
+throws_ok { $job->set_base_filename('') } qr/No base filename specified/ms,
+  'Empty base filename';
+
+# Test parameters attribute
+is( $job->parameters, undef, 'Get parameters' );
+is( $job->set_parameters('test'), undef, 'Set parameters' );
+is( $job->parameters, 'test', 'Get new parameters' );
+is( $job->set_parameters(), undef, 'Set undef parameters' );
+is( $job->parameters, undef, 'Get undef parameters' );
+
+# Test memory attribute
+is( $job->memory, undef, 'Get memory' );
+is( $job->set_memory(1000),
undef, 'Set memory' ); +is( $job->memory, 1000, 'Get new memory' ); +throws_ok { $job->set_memory(-1) } qr/Invalid memory/ms, 'Invalid memory'; + +# Test retries attribute +is( $job->retries, undef, 'Get retries' ); +is( $job->set_retries(5), undef, 'Set retries' ); +is( $job->retries, 5, 'Get new retries' ); +throws_ok { $job->set_retries(-1) } qr/Invalid retries/ms, 'Invalid retries'; + +# Test status code attribute +is( $job->status_code, 'NOT_RUN', 'Get not run status code' ); +is( $job->set_status_code('DONE'), undef, 'Set done status code' ); +is( $job->status_code, 'DONE', 'Get done status code' ); +is( $job->set_status_code('RUNNING'), undef, 'Set running status code' ); +is( $job->status_code, 'RUNNING', 'Get new running status code' ); +is( $job->set_status_code('FAILED'), undef, 'Set failed status code' ); +is( $job->status_code, 'FAILED', 'Get new failed status code' ); +throws_ok { $job->set_status_code() } qr/No status code specified/ms, + 'No status code'; +throws_ok { $job->set_status_code('invalid') } qr/Invalid status code/ms, + 'Invalid status code'; + +# Test status text attribute +is( $job->status_text, undef, 'Get status text' ); +is( $job->set_status_text('Job killed by owner'), undef, 'Set status text' ); +is( $job->status_text, 'Job killed by owner', 'Get new status text' ); +is( $job->set_status_text(), undef, 'Set undef status text' ); +is( $job->status_text, undef, 'Get undef status text' ); diff --git a/t/pipeline-stage.t b/t/pipeline-stage.t new file mode 100644 index 0000000..56dfaff --- /dev/null +++ b/t/pipeline-stage.t @@ -0,0 +1,64 @@ +use Test::More; +use Test::Exception; +use Test::Warn; +use Test::DatabaseRow; +use Test::MockObject; +use Carp; + +plan tests => 22; + +use DETCT::Pipeline::Stage; + +my $stage = DETCT::Pipeline::Stage->new( + { + name => 'count_tags', + default_memory => 3000, + } +); + +isa_ok( $stage, 'DETCT::Pipeline::Stage' ); + +# Test name attribute +is( $stage->name, 'count_tags', 'Get name' ); +is( $stage->set_name('bin_reads'), undef, 'Set name' ); +is( $stage->name, 'bin_reads', 'Get new name' ); +throws_ok { $stage->set_name() } qr/No name specified/ms, 'No name'; +throws_ok { $stage->set_name('/') } qr/Invalid name/ms, 'Invalid name'; + +# Test default memory attribute +is( $stage->default_memory, 3000, 'Get default memory' ); +is( $stage->set_default_memory(2000), undef, 'Set default memory' ); +is( $stage->default_memory, 2000, 'Get new default memory' ); +throws_ok { $stage->set_default_memory() } qr/No default memory specified/ms, + 'No default memory'; +throws_ok { $stage->set_default_memory(-1) } qr/Invalid default memory/ms, + 'Invalid default memory'; + +# Test all jobs run attribute +is( $stage->all_jobs_run, 0, 'Get all jobs run' ); +is( $stage->set_all_jobs_run(10), undef, 'Set all jobs run to true' ); +is( $stage->all_jobs_run, 1, 'Get new true all jobs run' ); +is( $stage->set_all_jobs_run(), undef, 'Set all jobs run to false' ); +is( $stage->all_jobs_run, 0, 'Get new false all jobs run' ); + +# Mock stage object with different name +my $stage1 = Test::MockObject->new(); +$stage1->set_isa('DETCT::Pipeline::Stage'); +$stage1->set_always( 'name', 'get_read_peaks' ); +my $stage2 = Test::MockObject->new(); +$stage2->set_isa('DETCT::Pipeline::Stage'); +$stage2->set_always( 'name', 'merge_read_peaks' ); + +# Test adding and retrieving prerequisites +my $prerequisites; +$prerequisites = $stage->get_all_prerequisites(); +is( scalar @{$prerequisites}, 0, 'No prerequisites' ); +is( $stage->add_prerequisite($stage1), undef, 
'Add prerequisite' ); +$prerequisites = $stage->get_all_prerequisites(); +is( scalar @{$prerequisites}, 1, 'Get one prerequisite' ); +$stage->add_prerequisite($stage2); +is( scalar @{$prerequisites}, 2, 'Get two prerequisites' ); +throws_ok { $stage->add_prerequisite() } qr/No prerequisite specified/ms, + 'No prerequisite specified'; +throws_ok { $stage->add_prerequisite('invalid') } qr/Class of prerequisite/ms, + 'Invalid prerequisite'; diff --git a/t/sample.t b/t/sample.t new file mode 100644 index 0000000..16b3bc4 --- /dev/null +++ b/t/sample.t @@ -0,0 +1,81 @@ +use Test::More; +use Test::Exception; +use Test::Warn; +use Test::DatabaseRow; +use Test::MockObject; +use Carp; + +plan tests => 36; + +use DETCT::Sample; + +my $sample = DETCT::Sample->new( + { + name => 'zmp_ph1_1m', + condition => 'mutant', + group => '1', + tag => 'NNNNBGAGGC', + bam_file => 't/data/test1.bam', + } +); + +isa_ok( $sample, 'DETCT::Sample' ); + +# Test name attribute +is( $sample->name, 'zmp_ph1_1m', 'Get name' ); +is( $sample->set_name('zmp_ph1_1s'), undef, 'Set name' ); +is( $sample->name, 'zmp_ph1_1s', 'Get new name' ); +throws_ok { $sample->set_name() } qr/No name specified/ms, 'No name'; +my $long_name = 'X' x ( $DETCT::Sample::MAX_NAME_LENGTH + 1 ); +throws_ok { $sample->set_name(' ') } qr/Invalid name specified/ms, + 'Invalid name'; +throws_ok { $sample->set_name('') } qr/Empty name specified/ms, 'Empty name'; +throws_ok { $sample->set_name($long_name) } qr/longer than \d+ characters/ms, + 'Long name'; + +# Test description attribute +is( $sample->description, undef, 'Get description' ); +is( $sample->set_description('ZMP phenotype 1.1 mutant'), + undef, 'Set description' ); +is( $sample->description, 'ZMP phenotype 1.1 mutant', 'Get new description' ); +is( $sample->set_description(), undef, 'Set undef description' ); +is( $sample->description, undef, 'Get undef description' ); + +# Test condition attribute +is( $sample->condition, 'mutant', 'Get condition' ); +is( $sample->set_condition('sibling'), undef, 'Set condition' ); +is( $sample->condition, 'sibling', 'Get new condition' ); +throws_ok { $sample->set_condition() } qr/No condition specified/ms, + 'No condition'; +my $long_condition = 'X' x ( $DETCT::Sample::MAX_CONDITION_LENGTH + 1 ); +throws_ok { $sample->set_condition('') } qr/Empty condition specified/ms, + 'Empty condition'; +throws_ok { $sample->set_condition($long_condition) } +qr/longer than \d+ characters/ms, 'Long condition'; + +# Test group attribute +is( $sample->group, '1', 'Get group' ); +is( $sample->set_group('2'), undef, 'Set group' ); +is( $sample->group, '2', 'Get new group' ); +is( $sample->set_group(), undef, 'Set undefined group' ); +is( $sample->group, undef, 'Get undefined group' ); +my $long_group = 'X' x ( $DETCT::Sample::MAX_GROUP_LENGTH + 1 ); +throws_ok { $sample->set_group('') } qr/Empty group specified/ms, 'Empty group'; +throws_ok { $sample->set_group($long_group) } qr/longer than \d+ characters/ms, + 'Long group'; + +# Test tag attribute +is( $sample->tag, 'NNNNBGAGGC', 'Get tag' ); +is( $sample->set_tag('NNNNBCAGAG'), undef, 'Set tag' ); +is( $sample->tag, 'NNNNBCAGAG', 'Get new tag' ); +throws_ok { $sample->set_tag() } qr/No tag specified/ms, 'No tag'; +throws_ok { $sample->set_tag('NNNNBCAGAN') } qr/Invalid tag/ms, 'Invalid tag'; + +# Test bam file attribute +is( $sample->bam_file, 't/data/test1.bam', 'Get BAM file' ); +is( $sample->set_bam_file('t/data/test2.bam'), undef, 'Set BAM file' ); +is( $sample->bam_file, 't/data/test2.bam', 'Get new BAM file' ); 
+throws_ok { $sample->set_bam_file() } qr/No BAM file specified/ms, + 'No BAM file'; +throws_ok { $sample->set_bam_file('nonexistent.bam') } +qr/does not exist or cannot be read/ms, 'Missing BAM file'; diff --git a/t/sequence.t b/t/sequence.t new file mode 100644 index 0000000..7aa985d --- /dev/null +++ b/t/sequence.t @@ -0,0 +1,36 @@ +use Test::More; +use Test::Exception; +use Test::Warn; +use Test::DatabaseRow; +use Test::MockObject; +use Carp; + +plan tests => 12; + +use DETCT::Sequence; + +my $sequence = DETCT::Sequence->new( + { + name => '1', + bp => 60_348_388, + } +); + +isa_ok( $sequence, 'DETCT::Sequence' ); + +# Test name attribute +is( $sequence->name, '1', 'Get name' ); +is( $sequence->set_name('2'), undef, 'Set name' ); +is( $sequence->name, '2', 'Get new name' ); +throws_ok { $sequence->set_name() } qr/No name specified/ms, 'No name'; +my $long_name = 'X' x ( $DETCT::Sequence::MAX_NAME_LENGTH + 1 ); +throws_ok { $sequence->set_name('') } qr/Empty name specified/ms, 'Empty name'; +throws_ok { $sequence->set_name($long_name) } qr/longer than \d+ characters/ms, + 'Long name'; + +# Test bp attribute +is( $sequence->bp, 60_348_388, 'Get bp' ); +is( $sequence->set_bp(60_300_536), undef, 'Set bp' ); +is( $sequence->bp, 60_300_536, 'Get new bp' ); +throws_ok { $sequence->set_bp() } qr/No bp specified/ms, 'No bp'; +throws_ok { $sequence->set_bp(-1) } qr/Invalid bp/ms, 'Invalid bp'; diff --git a/t/transcript.t b/t/transcript.t new file mode 100644 index 0000000..87fbca3 --- /dev/null +++ b/t/transcript.t @@ -0,0 +1,102 @@ +use Test::More; +use Test::Exception; +use Test::Warn; +use Test::DatabaseRow; +use Test::MockObject; +use Carp; + +plan tests => 47; + +use DETCT::Transcript; + +my $transcript = DETCT::Transcript->new( + { + stable_id => 'ENSDART00000133571', + biotype => 'protein_coding', + seq_name => '5', + start => 40352744, + end => 40354399, + strand => 1, + } +); + +isa_ok( $transcript, 'DETCT::Transcript' ); + +# Test stable id attribute +is( $transcript->stable_id, 'ENSDART00000133571', 'Get stable id' ); +is( $transcript->set_stable_id('ENSDART00000033574'), undef, 'Set stable id' ); +is( $transcript->stable_id, 'ENSDART00000033574', 'Get new stable id' ); +throws_ok { $transcript->set_stable_id() } qr/No stable id specified/ms, + 'No stable id'; +throws_ok { $transcript->set_stable_id('#invalid#') } qr/Invalid stable id/ms, + 'Invalid stable id'; + +# Test name attribute +is( $transcript->name, undef, 'Get name' ); +is( $transcript->set_name('cxc64-001'), undef, 'Set name' ); +is( $transcript->name, 'cxc64-001', 'Get new name' ); +is( $transcript->set_name(), undef, 'Set undef name' ); +is( $transcript->name, undef, 'Get undef name' ); +my $long_name = 'X' x ( $DETCT::Transcript::MAX_NAME_LENGTH + 1 ); +throws_ok { $transcript->set_name('') } qr/Name is empty/ms, 'Empty name'; +throws_ok { $transcript->set_name($long_name) } +qr/longer than \d+ characters/ms, 'Invalid name'; + +# Test description attribute +is( $transcript->description, undef, 'Get description' ); +is( $transcript->set_description('CXC chemokine 64'), undef, + 'Set description' ); +is( $transcript->description, 'CXC chemokine 64', 'Get new description' ); +is( $transcript->set_description(), undef, 'Set undef description' ); +is( $transcript->description, undef, 'Get undef description' ); + +# Test biotype attribute +is( $transcript->biotype, 'protein_coding', 'Get biotype' ); +is( $transcript->set_biotype('nonsense_mediated_decay'), undef, 'Set biotype' ); +is( $transcript->biotype, 
'nonsense_mediated_decay', 'Get new biotype' ); +throws_ok { $transcript->set_biotype() } qr/No biotype specified/ms, + 'No biotype'; +throws_ok { $transcript->set_biotype('#invalid#') } qr/Invalid biotype/ms, + 'Invalid biotype'; + +# Test sequence name attribute +is( $transcript->seq_name, '5', 'Get sequence name' ); +is( $transcript->set_seq_name('6'), undef, 'Set sequence name' ); +is( $transcript->seq_name, '6', 'Get new sequence name' ); +throws_ok { $transcript->set_seq_name() } qr/No sequence name specified/ms, + 'No sequence name'; +throws_ok { $transcript->set_seq_name('#invalid#') } +qr/Invalid sequence name/ms, 'Invalid sequence name'; + +# Test start attribute +is( $transcript->start, 40352744, 'Get start' ); +is( $transcript->set_start(30352744), undef, 'Set start' ); +is( $transcript->start, 30352744, 'Get new start' ); +throws_ok { $transcript->set_start() } qr/No start specified/ms, 'No start'; +throws_ok { $transcript->set_start(-1) } qr/Invalid start/ms, 'Invalid start'; + +# Test end attribute +is( $transcript->end, 40354399, 'Get end' ); +is( $transcript->set_end(30354399), undef, 'Set end' ); +is( $transcript->end, 30354399, 'Get new end' ); +throws_ok { $transcript->set_end() } qr/No end specified/ms, 'No end'; +throws_ok { $transcript->set_end(-2) } qr/Invalid end/ms, 'Invalid end'; + +# Test strand attribute +is( $transcript->strand, 1, 'Get strand' ); +is( $transcript->set_strand(-1), undef, 'Set strand' ); +is( $transcript->strand, -1, 'Get new strand' ); +throws_ok { $transcript->set_strand() } qr/No strand specified/ms, 'No strand'; +throws_ok { $transcript->set_strand(0) } qr/Invalid strand/ms, 'Invalid strand'; + +# Mock gene object +my $gene = Test::MockObject->new(); +$gene->set_isa('DETCT::Gene'); +$gene->set_always( 'name', 'cxc64' ); + +# Test gene attribute +is( $transcript->gene, undef, 'Get gene' ); +is( $transcript->set_gene($gene), undef, 'Set gene' ); +is( $transcript->gene->name, 'cxc64', 'Get new gene' ); +throws_ok { $transcript->set_gene('invalid') } qr/Class of gene/ms, + 'Invalid gene';