=head1 NAME DAS::GUS::Segment - DAS-style access to a GUS database =head1 SYNOPSIS # Get a Bio::Das::SegmentI object from a DAS::GUS database $segment = $das->segment( -name => 'Landmark', -start => $start, -stop => $stop ); =head1 AUTHOR Name: Haiming Wang Email: hwang@uga.edu =cut package DAS::GUS::Segment; use strict; use Bio::Root::Root; use Bio::Das::SegmentI; use DAS::GUS::Segment::Feature; use constant DEBUG => 1; use vars '@ISA', '$VERSION'; @ISA = qw(Bio::Root::Root Bio::SeqI Bio::Das::SegmentI); $VERSION = 0.11; use overload '""' => 'asString'; our $dlm = ";;"; # separate attributes $tag=$value pairs =head2 new Title : new Usage : $segment = $db->segment(-name => 'AAEL01000015', -start => $start, -stop => $stop ); Function: Create a segment object Returns : a new DAS::GUS::Segment object Args : see below This method creates a new DAS::GUS::Segment object accoring to a segment name, such as contig 'AAEL0100015'. Generally this is called automatically by the DAS::GUS module. There are five positional arguments: $factory a DAS::GUS adaptor to use for database access $start start of the desired segment relative to source sequence $stop stop of the desired segment relative to source sequence $srcfeature_id ID of the source sequence $class type of the sequence, i.e. chromosome, contig $name name of the segment $atts attributes of the segment =cut sub new { my $self = shift; my ( $name, $factory, $start, $stop, $atts ) = @_; my $query = $factory->parser->getSQL("Segment.pm", "new:Segment"); die "Couldn't find Segment.pm sql for new:Segment\n" unless $query; $query =~ s/(\$\w+)/eval $1/eg; my $sth = $factory->dbh->prepare($query); $sth->execute(); my $hashref = $sth->fetchrow_hashref; warn "END or STARTM of $name could not be determined by sql: $query\n" unless exists $$hashref{'END'} && exists $$hashref{'STARTM'}; my $length = $$hashref{'END'} - $$hashref{'STARTM'} + 1; $stop = ($stop && ($stop < $length)) ? int($stop) : $length; $start = ($start && ($start > 0)) ? int($start) : 1; return bless { factory => $factory, start => $start, end => $stop, srcfeature_id => $$hashref{'SRCFEATURE_ID'}, length => $length, class => $$hashref{'TYPE'}, name => $$hashref{'NAME'}, atts => $$hashref{'ATTS'}, }, ref $self || $self; } =head2 name Title : name Usage : $segname = $seg->name(); Function : Return the name of the segment Returns : see avove Args : none Status : public =cut sub name { my $self = shift; return $self->{'name'} = shift if @_; return $self->{'name'}; } =head2 class Title : class Usage : $obj->class($newval) Function: Return the segment class Returns : value of class (a scalar) Args : on set, new value (a scalar or undef, optional) =cut sub class { my $self = shift; return $self->{'class'} = shift if @_; return $self->{'class'}; } *type = \&class; =head2 attributes Title : attributes Usage : $obj->attributes($newval) Function: Return the segment attributes Returns : attributes string for gff3 dump Args : =cut sub attributes { my $self = shift; return $self->{'atts'} = shift if @_; return $self->{'atts'}; } =head2 seq_id Title : seq_id Usage : $ref = $s->seq_id Function: return the ID of the landmark, aliased to name() for backward compatibility Return : a string Args : none Status : public =cut *seq_id = \&name; =head2 start Title : start Usage : $s->start Function: start of segment Returns : integer Args : none Status : Public =cut sub start { my $self = shift; return $self->{'start'} = shift if @_; return $self->{'start'}; } =head2 low Title : low Usage : $s->low Function: start of segment; Alias of start for backward compatibility Returns : integer Args : none Status : Public =cut *low = \&start; =head2 end Title : end Usage : $s->end Function: end of segment; Returns : integer Args : none Status : Public =cut sub end { my $self = shift; return $self->{'end'} = shift if @_; return $self->{'end'}; } =head2 high Title : high Usage : $s->high Function: end of segment; Alias of end for backward compatibility. Returns : integer Args : none Status : Public =cut *high = \&end; =head2 stop Title : stop Usage : $s->stop Function: end of segment; Alias of end for backward compatibility. Returns : integer Args : none Status : Public =cut *stop = \&end; =head2 length Title : length Usage : $s->length Function: length of segment; Returns : integer Args : none Status : Public =cut sub length { #shift->{length}; abs($_[0]->{start} - $_[0]->{end}) + 1 } =head2 features Title : features Usage : @features = $s->features(@args) Function: get features that overlap this segment Returns : a list of Bio::SeqFeatureI objects Args : see below Status : public This method will find all features that intersect the segment in a variety of ways and returns a list of Bio::SeqFeatureI objects. The feature locations will use coordinates relative to the reference sequence in effect at the time that features() was called. The returned list can be limited to certain types, attributes or range intersection modes. Types of range intersection are one of: "overlaps" the default "contains" return features completely contained within the segment "contained_in" retunr features that completely contain the segment Two types of argument lists are accepts. In the positional argument form, the arguments are treated as a list of feature types. In the named parameter form, the arguments are a series of -name=Evalue pairs. Argument Description ------------------------------------- -types An array reference to type names in the format "method:source" -attributes A hashref containing a set of attributes to match -rangetype One of "overlaps", "contains", or "contained_in". -iterator Return an iterator across the features. -callback A callback to invoke on each feature The -attributes argument is a hashref containing one or more attributes to match against: -attributes => { Gene => 'abc-1', Note => 'confirmed' } Attribute matching is simple string matching, and multiple attributes are ANDed together. More complex filtering can be performed using the -callback option (see below) If -iterator is true, then the method returns an object reference that implements the next_seq() method. Each call to next_seq() returns a new Bio::SeqFeatureI object. If -callback is passed a code reference, the code reference will be invoked on each feture returned. The code will be passed two arguments consisting of the current feature and the segment object itself, and must return a true value. If the code returns a false value, feature retrieval will be aborted. -callback and -iterator are mutually exclusive options. If -iterator is defined, then -callback is ignored. =cut sub features { my $self = shift; my ($sql, @features, $base_start, $rend); my ($type, $types,$attributes,$rangetype,$iterator,$callback,$start, $stop,$feature_id,$factory) = $self->_rearrange([qw(TYPE TYPES ATTRIBUTES RANGETYPE ITERATOR CALLBACK START STOP FEATURE_ID FACTORY ) ], @_); $types ||= $type; my $srcfeature_id = $self->srcfeature_id; my $segname = $self->name; $base_start = $self->start; $rend = $self->end; $factory ||= $self->factory; ########################################################### # # You can write queries from here to retrieve TOP level # features (such as gene, blast...) from GUS. # # NOTE: a query name MUST follow the format in the conf file, # for example: in conf file, a gene track like, # # [Gene] # feature = gene:Genbank # # Your gene query name MUST be gene:Genbank # ########################################################### foreach my $typeHash ( _getUniqueTypes($types) ) { my @types = keys(%$typeHash); my $type = shift @types; my $typeString = $typeHash->{$type}; $sql = $factory->parser->getSQL("Segment.pm", $type); warn "Couldn't find Segment.pm sql for $type\n" unless $sql; next unless $sql; $sql =~ s/(\$\w+)/eval $1/eg; my $sth = $factory->dbh->prepare($sql); $sth->execute() or $self->throw("getting feature query failed"); my @tempfeats = (); while (my $featureRow = $sth->fetchrow_hashref) { push @tempfeats, $self->_makeFeature($featureRow, $factory); } # filter out blastx. it is used by cryptodb if($typeString =~ /blastx/i) { warn "filtering blastx results for feature type: $type"; @tempfeats = _blastx_filter(\@tempfeats); } push(@features, @tempfeats); my $bulkSubFeatureSql = $factory->parser->getSQL("Feature.pm", "$type:bulksubfeatures"); if($bulkSubFeatureSql) { $bulkSubFeatureSql =~ s/(\$\w+)/eval $1/eg; $self->_addBulkSubFeatures(\@features, $bulkSubFeatureSql, $factory) } my $bulkAttributeSql = $factory->parser->getSQL("Feature.pm", "$type:bulkAttribute"); next unless $bulkAttributeSql; $bulkAttributeSql =~ s/(\$\w+)/eval $1/eg; $self->_addBulkAttribute(\@features, $bulkAttributeSql, $factory); } if($iterator) { return DAS::GUSIterator->new(\@features); } elsif ( wantarray ) { return @features; } else { return \@features; } } sub _addBulkAttribute { my($self, $features, $bulkAttributeSql, $factory) = @_; my %featuresById; map { $featuresById{$_->feature_id} = $_ } @$features; my $sth = $factory->dbh->prepare($bulkAttributeSql); $sth->execute() or $self->throw("getting bulk attribute query failed"); my @bulkAtts; while (my $featureRow = $sth->fetchrow_hashref) { my $feature = $featuresById{$$featureRow{'FEATURE_ID'}}; if ($feature) { $feature->bulkAttributes($featureRow); } } } sub _addBulkSubFeatures { my ($self, $features, $subFeatureSql, $factory) = @_; my %featuresById; map { $featuresById{$_->feature_id} = $_ } @$features; my $sth = $factory->dbh->prepare($subFeatureSql); $sth->execute() or $self->throw("getting bulk subfeature query failed"); while (my $featureRow = $sth->fetchrow_hashref) { my $feature = $featuresById{$$featureRow{'PARENT_ID'}}; if ($feature) { $feature->_addSubFeatureFromRow($featureRow); } else { $self->warn("sub feature [" . $$featureRow{'FEATURE_ID'} . "]'s parent feature [" . $$featureRow{'PARENT_ID'} . "] could not be found. bulk subfeature query is:\n" . $subFeatureSql); } } } =head2 _getUniqueTypes Title : _getUniqueTypes Usage : $segment->_getUniqueTypes() Function: filter out the duplicate types and return an array of unique feature types in 'type_source' format Returns : an array of feature types Args : array ref Status : Private For example, in the config file [Gene] feature = gene:Genbank [BLASTX] feature = match:WU_BLASTX _getUniqeTypes will subsitute ':' with '_' and return [ gene_Genbank, match_WU_BLASTX ], the latter can be used to find the corresponding element in SQL xml files. =cut sub _getUniqueTypes() { my $types = shift; my @uniqtypes = (); my %seen; for my $type (@{$types || []}) { push(@uniqtypes, { $type => $type }) unless $seen{$type}++; } return @uniqtypes; } sub _makeFeature() { my ($self, $featureRow, $factory) = @_; my $type = $$featureRow{'TYPE'}; my $source = $$featureRow{'SOURCE'}; my $feature_id = $$featureRow{'FEATURE_ID'}; my $unique_name = "$type.$feature_id"; $type .= ":$source"; my $feat = DAS::GUS::Segment::Feature->new( $factory, $self, # parent $self->seq_id, $$featureRow{'STARTM'}, # start $$featureRow{'END'}, # end $$featureRow{'TYPE'}.':'.$$featureRow{'SOURCE'}, # type $$featureRow{'SCORE'}, # score $$featureRow{'STRAND'}, # strand $$featureRow{'PHASE'}, # phase $$featureRow{'NAME'}, # group $$featureRow{'ATTS'}, # attributes $unique_name, $$featureRow{'FEATURE_ID'}, # feature_id ); $feat->source($source); # if this is mRNA .. then get the exons .... # depending on the type, build sub_feature here $feat; } # filter out the blastx output. First, sort the blastx data using sql # by e-value, match length and start. # Second, filter features with overlap > 5 sub _blastx_filter { my $feats = shift; my $counter = 0; #my $idx = -1; my $old_end = 2000000; my @newfeats = (); foreach my $f(@$feats) { my $name = $f->name; #$idx = $idx + 1; my $start = $f->start; my $end = $f->end; if($start <= $old_end) { $counter = $counter + 1; # find one; } else { $old_end = $end; push(@newfeats, $f); $counter = 0; next; } if($counter >= 5) { #splice(@$feats, $idx, 1); #$idx = $idx - 1; # reset index } else { push(@newfeats, $f); $old_end = $end; } } return @newfeats; } =head2 get_all_SeqFeature, get_SeqFeatures, top_SeqFeatures, all_SeqFeatures Title : get_all_SeqFeature, get_SeqFeatures, top_SeqFeatures, all_SeqFeatures Usage : $s->get_all_SeqFeature() Function: get the sequence string fro this segment Several aliases of features() for backword compatibility Returns : a string Args : none Status : Public =cut *get_all_SeqFeature = *get_SeqFeatures = *top_SeqFeatures = *all_SeqFeatures = \&features; =head2 seq Title : seq Usage : $s->seq Function: get the sequence string for this segment Returns : a string Args : none Status : Public =cut sub seq { my $self = shift; return $self->{'seq'} = shift if @_; return $self->{'seq'} if( $self->{'seq'}); my ($ref, $class, $base_start, $stop, $strand) = @{$self}{qw(sourceseq class start end strand)}; my $srcfeature_id = $self->{srcfeature_id}; my $has_start = defined $base_start; my $has_stop = defined $stop; $strand ||= 0; my $reversed; if($has_start && $has_stop && ($base_start > $stop)) { $reversed++; ($base_start, $stop) = ($stop, $base_start); } elsif( $strand < 0) { $reversed++; } my $seqQuery = $self->factory->parser->getSQL("Segment.pm", "get_sequence"); warn "Couldn't find Segment.pm sql for get_sequence\n" unless $seqQuery; return unless $seqQuery; $seqQuery =~ s/(\$\w+)/eval $1/eg; my $sth = $self->factory->dbh->prepare($seqQuery); $sth->execute(); my ($seq) = $sth->fetchrow_array(); if (!$has_start && !$has_stop) { # do nothing, sequence is already complete } elsif (!$has_start) { $seq = substr($seq, 0, $stop - 1); } elsif(!$has_stop) { $seq = substr($seq, $base_start - 1); } else { # has both start and stop $seq = substr($seq, $base_start - 1, $stop - $base_start + 1); } if($reversed) { $seq = reverse $seq; $seq =~ tr/gatcGATC/ctagCTAG/; } return $seq; } *protein = *dna = \&seq; =head2 secondary_structure_encodings Title : secondary_structure_encodings Usage : $s->secondary_structure_encodings Function: get the secondary structure prediction scores for segment Returns : hash ref { secondary_structure_type => string of 0-9 one digit per base } Args : none Status : Public =cut sub secondary_structure_encodings { my $self = shift; my $srcfeature_id = $self->{srcfeature_id}; my $strucQuery = $self->factory->parser->getSQL("Segment.pm", "get_2d_struc"); warn "Couldn't find Segment.pm sql for get_2d_struc\n" unless $strucQuery; return unless $strucQuery; $strucQuery =~ s/(\$\w+)/eval $1/eg; my $sth = $self->factory->dbh->prepare($strucQuery); $sth->execute(); my $encodings = undef; while (my ($type, $encoding) = $sth->fetchrow_array()) { $encodings = {} unless defined($encodings); $type = 'helix' if $type =~ /^h$/i; $type = 'coil' if $type =~ /^c$/i; $type = 'strand' if $type =~ /^e$/i; $encodings->{$type} = $encoding; } unless ($encodings && $encodings->{helix}) { warn "no structure encodings retrieved by sql: $strucQuery\n"; } return $encodings; } =head2 factory Title : factory Usage : $factory = $s->factory Function: return the segment factory Returns : a Bio::DasI object Args : see below Status : Public This method returns a Bio::DasI object that can be used to fetch more segments. This is typically the Bio::DasI object from which the segments was originally generated. =cut sub factory { shift->{factory} } =head2 srcfeature_id Title : srcfeature_id Usage : $obj->srcfeature_id($newval) Function: Returns : value of srcfeature_id (a scalar) Args : on set, new value (a scalar or undef, optional) =cut sub srcfeature_id { my $self = shift; return $self->{'srcfeature_id'} = shift if @_; return $self->{'srcfeature_id'}; } =head2 alphabet Title : alphabet Usage : $obj->alphabet($newval) Function: Returns : scalar 'dna' Args : on set, new value ( a scalar or undef, optional ) Status : Public =cut sub alphabet { return 'dna'; } =head2 display_id, display_name, accession_number, desc Title : display_id, display_name, accession_number, desc Usage : $s->display_name() Function: Alias of name() Several aliases for name; it may be that these could do something better than just giving back the name. Returns : string Args : none Status : Public =cut *display_id = *display_name = *accession_number = *desc = \&name; =head2 get_feature_stream Title : get_feature_stream Usage : Function: Returns : Args : Status : =cut sub get_feature_stream { my $self = shift; my @args = @_; my $features = $self->features(@args); return DAS::GUSIterator->new($features); } sub get_seq_stream { my @features = shift->features(@_); return DAS::GUSIterator->new(\@features); } =head2 clone Title : clone Usage : $copy = $s->clone Function: make a copy of this segment Returns : a Bio::DB::GFF::Segment object Args : none Status : Public =cut sub clone { my $self = shift; my %h = %$self; return bless \%h, ref($self); } =head2 sourceseq Title : sourceseq Usage : $obj->sourceseq($newval) Function: get feature name according to a feature_id Returns : value of sourceseq(a scalar) Args : on set, new value ( a scalar or undef, optional ) Status : Public =cut sub sourceseq { my $self = shift; return $self->name; } =head2 abs_ref Title : abs_ref Usage : $obj->abs_ref() Function: Alisas of sourceseq Alias of sourceseq for backward compatibility Returns : value of sourceseq ( a scalar ) Args : none Status : Public =cut *abs_ref = \&sourceseq; =head2 abs_start Title : abs_start Usage : $obj->abs_start() Function: Alias of start Returns : value of start ( a scalar ) Args : none Status : Public =cut *abs_start = \&start; =head2 abs_end Title : abs_end Usage : $obj->abs_end() Function: Alias of end Returns : value of end ( a scalar ) Args : none Status : Public =cut *abs_end = \&end; =head2 asString Title : asString Usage : $s->asString Function: human-readable string for segment Returns a human-readable string representing this sequence. Format is: sourceseq:start,stop Returns : a string Args : none Status : Public =cut sub asString { my $self = shift; my $label = $self->refseq; my $start = $self->start; my $stop = $self->stop; return "$label:$start,$stop"; } # implement SeqI abstract method. # required by BatchDumper plugin. don't yet know what, if anything, needs # to happen here. This is sufficient at the moment to quash execptions. # -mheiges sub primary_seq { my $self = shift; } 1;