Class Bio::FastaDefline
In: lib/bio/db/fasta/defline.rb
Parent: Object

Parses a FASTA defline and extracts IDs and other information. IDs are NSIDs (NCBI standard FASTA sequence identifiers) or ":"-separated IDs.

Specifications are described in: ftp.ncbi.nih.gov/blast/documents/README.formatdb and blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers

Examples

  rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
  rub.entry_id       ==> 'gi|671595'
  rub.get('emb')     ==> 'CAA85678.1'
  rub.emb            ==> 'CAA85678.1'
  rub.gi             ==> '671595'
  rub.accession      ==> 'CAA85678'
  rub.accessions     ==> [ 'CAA85678' ]
  rub.acc_version    ==> 'CAA85678.1'
  rub.locus          ==> nil
  rub.list_ids       ==> [["gi", "671595"],
                          ["emb", "CAA85678.1", nil],
                          ["Perovskia abrotanoides"]]

  ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
  ckr.entry_id      ==> "gi|2495000"
  ckr.sp            ==> "CCKR_CAVPO"
  ckr.pir           ==> "I51898"
  ckr.gb            ==> "AAB29504.1"
  ckr.gi            ==> "2495000"
  ckr.accession     ==> "AAB29504"
  ckr.accessions    ==> ["Q63931", "AAB29504"]
  ckr.acc_version   ==> "AAB29504.1"
  ckr.locus         ==> nil
  ckr.description   ==>
    "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
  ckr.descriptions  ==>
    ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
     "cholecystokinin A receptor - guinea pig",
     "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
  ckr.words         ==>
    ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
     "receptor", "type"]
  ckr.id_strings    ==>
    ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
     "544724", "AAB29504.1", "Cavia"]
  ckr.list_ids      ==>
    [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
     ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
     ["gb", "AAB29504.1", nil], ["Cavia"]]

References

Methods

Constants

NSIDs = { # NCBI and WU-BLAST 'gi' => [ 'gi' ], # NCBI GI 'gb' => [ 'acc_version', 'locus' ], # GenBank 'emb' => [ 'acc_version', 'locus' ], # EMBL 'dbj' => [ 'acc_version', 'locus' ], # DDBJ 'sp' => [ 'accession', 'entry_id' ], # SWISS-PROT 'tr' => [ 'accession', 'entry_id' ], # TREMBL 'pdb' => [ 'entry_id', 'chain' ], # PDB 'bbs' => [ 'number' ], # GenInfo Backbone Id 'gnl' => [ 'database' , 'entry_id' ], # General database identifier 'ref' => [ 'acc_version' , 'locus' ], # NCBI Reference Sequence 'lcl' => [ 'entry_id' ], # Local Sequence identifier # WU-BLAST and NCBI 'pir' => [ 'accession', 'entry_id' ], # PIR 'prf' => [ 'accession', 'entry_id' ], # Protein Research Foundation 'pat' => [ 'country', 'number', 'serial' ], # Patents # WU-BLAST only 'bbm' => [ 'number' ], # NCBI GenInfo Backbone database identifier 'gim' => [ 'number' ], # NCBI GenInfo Import identifier 'gp' => [ 'acc_version', 'locus' ], # GenPept 'oth' => [ 'accession', 'name', 'release' ], # Other (user-definable) identifier 'tpd' => [ 'accession', 'name' ], # Third party annotation, DDBJ 'tpe' => [ 'accession', 'name' ], # Third party annotation, EMBL 'tpg' => [ 'accession', 'name' ], # Third party annotation, GenBank # Original 'ri' => [ 'entry_id', 'rearray_id', 'len' ], # RIKEN FANTOM DB }
KillWords = [ 'an', 'the', 'this', 'that', 'is', 'are', 'were', 'was', 'be', 'can', 'may', 'might', 'as', 'at', 'by', 'for', 'in', 'of', 'on', 'to', 'with', 'from', 'and', 'or', 'not', 'dna', 'rna', 'mrna', 'cdna', 'orf', 'aa', 'nt', 'pct', 'id', 'ec', 'sp', 'subsp', 'similar', 'involved', 'identical', 'identity', 'cds', 'clone', 'library', 'contig', 'contigs', 'homolog', 'homologue', 'homologs', 'homologous', 'protein', 'proteins', 'gene', 'genes', 'product', 'products', 'sequence', 'sequences', 'strain', 'strains', 'region', 'regions', ]
KillWordsHash = {}
KillRegexpArray = [ /\A\d{1,3}\%?\z/, /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/, /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/ ]

Attributes

entry_id  [R]  Shows a possibly unique identifier. Returns a string.
list_ids  [R]  Shows an array that contains IDs (or ID-like strings). Returns an array of arrays of strings.

Public Class methods

Parses given string.

[Source]

     # File lib/bio/db/fasta/defline.rb, line 181
# Builds a new instance from the defline string +str+.
#
# +str+ is split on the "\x01" character and every resulting piece is
# handed to add_defline in order.
def initialize(str)
  @deflines = []
  @info     = {}
  @list_ids = []
  @entry_id = nil

  str.split("\x01").each { |piece| add_defline(piece) }
end

Public Instance methods

Shows the accession with its version number. If the entry has more than one such ID, only the first one is shown. Returns a string or nil.

[Source]

     # File lib/bio/db/fasta/defline.rb, line 494
# Returns the first 'acc_version' identifier found by get_by_type,
# or nil when there is none.
#
# The lookup runs once; defined? (rather than a truthiness check) is
# used so that a nil result is remembered as well.
def acc_version
  return @acc_version if defined?(@acc_version)

  @acc_version = get_by_type('acc_version')
end

Shows an accession number.

[Source]

     # File lib/bio/db/fasta/defline.rb, line 512
# Returns a single accession number.
#
# When acc_version yields a value, the part before the first '.' is
# used; otherwise the first element of accessions is used. The result
# (including nil) is computed once and then reused.
def accession
  unless defined?(@accession)
    versioned = acc_version
    @accession = versioned ? versioned.split('.')[0] : accessions[0]
  end
  @accession
end

Shows accession numbers. Returns an array of strings.

[Source]

     # File lib/bio/db/fasta/defline.rb, line 503
# Returns an array of accession numbers.
#
# Collects every 'accession' and 'acc_version' identifier through
# get_all_by_type and removes everything from the first '.' onwards,
# so version suffixes are dropped. The array is built once and reused.
def accessions
  return @accessions if defined?(@accessions)

  @accessions = get_all_by_type('accession', 'acc_version')
  @accessions.collect! { |id| id.sub(/\..*\z/, '') }
end

Parses given string and adds parsed data.

[Source]

     # File lib/bio/db/fasta/defline.rb, line 195
# Parses one defline string and records the result.
#
# +str+ is tried against three layouts, in this order:
# 1. "|"-separated NSIDs,
# 2. a "db:id" colon-separated ID,
# 3. a bare first word followed by an optional description.
# Anything else is stored verbatim as the ID with an empty description.
#
# The parsed line is appended to @deflines, and @entry_id is set from
# this line only if it has not been set before.
def add_defline(str)
  case str
  when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/
    # NSIDs, for example:
    #   >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P
    #
    # note: (?: ... ) in the pattern groups without capturing
    id_text = Regexp.last_match(1)
    desc    = Regexp.last_match(2)
    tokens  = id_text.split('|')
    # String#split drops a trailing empty field; restore it.
    tokens << '' if id_text[-1, 1] == '|'
    parsed  = parse_NSIDs(tokens)
    id_text = parsed[0].join('|')
    parsed.unshift('|')
    # NOTE(review): presumably parse_NSIDs consumes the tokens it
    # understands; whatever is left is put in front of the description.
    desc = "#{tokens.join('|')} #{desc}" unless tokens.empty?
    parsed << desc
    this_line = parsed
    match_EC(desc)
    parse_square_brackets(desc).each do |text|
      next if match_EC(text, false)
      next unless text =~ /\A[A-Z]/

      @list_ids << [text]
      @info['organism'] ||= text
    end

  when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/
    # examples:
    #   >sce:YBR160W  CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
    #   >emb:CACDC28 [X80034] C.albicans CDC28 gene
    id_text   = Regexp.last_match(1)
    desc      = Regexp.last_match(2)
    pair      = parse_ColonSepID(id_text)
    id_text   = pair.join(':')
    this_line = [':', pair, desc]
    match_EC(desc)
    parse_square_brackets(desc).each do |text|
      if !match_EC(text, false) && text =~ /:/
        parse_ColonSepID(text)
      elsif text =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/
        @list_ids << [Regexp.last_match(1)]
      end
    end

  when /^\>?\s*(\S+)(?:\s+(.+))?$/
    # example:
    #   >ABC12345 this is test
    id_text = Regexp.last_match(1)
    desc    = Regexp.last_match(2).to_s
    @list_ids << [id_text.chomp('.')]
    this_line = ['', [id_text], desc]
    match_EC(desc)

  else
    id_text = str
    desc    = ''
    match_EC(id_text)
    this_line = ['', [id_text], desc]
  end

  @deflines << this_line
  @entry_id ||= id_text
end

Shows description.

[Source]

     # File lib/bio/db/fasta/defline.rb, line 337
# Returns the last element of the first parsed defline, which is where
# add_defline stores the description. Returns nil when no defline has
# been parsed.
def description
  @deflines.first.to_a.last
end

Returns descriptions.

[Source]

     # File lib/bio/db/fasta/defline.rb, line 342
# Returns an array holding the last element (the description) of every
# parsed defline, in the order the deflines were added.
def descriptions
  @deflines.map { |entry| entry.last }
end

Returns an identifier by database name.

[Source]

     # File lib/bio/db/fasta/defline.rb, line 418
# Returns the identifier stored for the database +dbname+, or nil.
#
# +dbname+ is converted with to_s, so symbols are accepted. A value
# already present in @info is returned directly. Otherwise the first
# entry of @list_ids whose tag equals the name is used:
# * an entry with at most two elements yields its last element;
# * a longer entry is read through the field labels in NSIDs,
#   preferring 'acc_version', 'entry_id', 'locus', 'accession' and
#   'number' in that order, and falling back to the first non-nil field.
#
# A name that has no labels in NSIDs no longer raises NoMethodError on
# a long entry; it simply uses the fallback. A found value is cached
# in @info.
def get(dbname)
  db = dbname.to_s
  cached = @info[db]
  return cached if cached

  di = @list_ids.find { |x| x[0] == db }
  return nil unless di

  if di.size <= 2
    r = di[-1]
  else
    # Unknown database names have no labels; treat that as "no labels"
    # instead of calling index on nil.
    labels = self.class::NSIDs[db] || []
    r = nil
    %w[acc_version entry_id locus accession number].each do |label|
      idx = labels.index(label)
      next unless idx

      r = di[idx + 1]
      break if r
    end
    r ||= di[1..-1].find { |x| x }
  end

  @info[db] = r if r
  r
end

Returns identifiers by given type.

[Source]

     # File lib/bio/db/fasta/defline.rb, line 454
# Returns every identifier whose field label is one of +type_strarg+.
#
# Each entry of @list_ids whose tag is a key of NSIDs is inspected; for
# every requested label that the tag defines, the corresponding field
# is collected unless it is nil/false. Results keep entry order, and
# within one entry follow the order of the arguments.
def get_all_by_type(*type_strarg)
  @list_ids.each_with_object([]) do |entry, found|
    labels = self.class::NSIDs[entry[0]]
    next unless labels

    type_strarg.each do |type|
      pos = labels.index(type)
      found << entry[pos + 1] if pos && entry[pos + 1]
    end
  end
end

Returns an identifier by given type.

[Source]

     # File lib/bio/db/fasta/defline.rb, line 442
# Returns the first identifier whose field label is +type_str+, or nil.
#
# Entries of @list_ids are scanned in order. An entry is considered
# when its tag is a key of NSIDs and that tag defines +type_str+.
# Entries whose slot for that label is empty are skipped, matching
# get_all_by_type, so an ID carried by a later entry is still found.
def get_by_type(type_str)
  @list_ids.each do |entry|
    labels = self.class::NSIDs[entry[0]]
    next unless labels

    pos = labels.index(type_str)
    next unless pos

    value = entry[pos + 1]
    return value if value
  end
  nil
end

Shows the GI. If the entry has more than one such ID, only the first one is shown. Returns a string or nil.

[Source]

     # File lib/bio/db/fasta/defline.rb, line 483
# Returns the first 'gi' identifier found by get_by_type, or nil.
#
# The lookup runs once; defined? is used so that a nil result is
# remembered as well.
def gi
  return @gi if defined?(@gi)

  @gi = get_by_type('gi')
end

Shows ID-like strings. Returns an array of strings.

[Source]

     # File lib/bio/db/fasta/defline.rb, line 350
# Returns an array of ID-like strings.
#
# From @list_ids: entries with two or more elements contribute every
# non-nil field after the tag; shorter entries contribute their only
# element when it is non-empty and consists solely of letters, digits,
# '.', '-' and '_'. Case-sensitive words from the descriptions that
# look like identifiers are appended afterwards.
def id_strings
  found = []
  @list_ids.each do |ids|
    if ids.size >= 2
      found.concat(ids[1..-1].select { |field| field })
    elsif !ids[0].to_s.empty? && ids[0] =~ /\A[A-Za-z0-9\.\-\_]+\z/
      found << ids[0]
    end
  end

  # An empty kill list keeps the ID-like words that words() would
  # otherwise filter out with its default patterns.
  id_like = words(true, []).select do |word|
    word =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ ||
      word =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
  end
  found.concat(id_like)
end

Shows the locus. If the entry has more than one such ID, only the first one is shown. Returns a string or nil.

[Source]

     # File lib/bio/db/fasta/defline.rb, line 472
# Returns the first 'locus' identifier found by get_by_type, or nil.
#
# The lookup runs once; defined? is used so that a nil result is
# remembered as well.
def locus
  return @locus if defined?(@locus)

  @locus = get_by_type('locus')
end

[Source]

     # File lib/bio/db/fasta/defline.rb, line 523
# Dynamic accessor: obj.emb, obj.sp, ... behave like get('emb'), etc.
#
# The method name and any extra arguments are passed to get. Its
# result is returned when it is truthy, or when the name is a key of
# NSIDs (a known database that simply has no value here). For any
# other name a RuntimeError is raised; the exception class and message
# are kept as they were so existing rescue clauses keep working.
def method_missing(name, *args)
  found = get(name, *args)
  return found if found
  return found if self.class::NSIDs[name.to_s]

  raise "NameError: undefined method `#{name.inspect}'"
end

# Makes respond_to? agree with method_missing: true for every key of
# NSIDs and for any name that get can currently resolve.
def respond_to_missing?(name, include_private = false)
  key = name.to_s
  return true if self.class::NSIDs.key?(key)
  # Guard against objects whose state has not been set up yet.
  return true if @info && @list_ids && get(key)

  super
end

Shows the defline as a string. Note that the result may differ from the original string that was passed to FastaDefline.new.

[Source]

     # File lib/bio/db/fasta/defline.rb, line 329
329:     def to_s
330:       @deflines.collect { |a|
331:         s = a[0]
332:         (a[1..-2].collect { |x| x.join(s) }.join(s) + ' ' + a[-1]).strip
333:       }.join("\x01")
334:     end

Shows words used in the defline. Returns an Array.

[Source]

     # File lib/bio/db/fasta/defline.rb, line 392
# Returns the sorted, de-duplicated words used in the descriptions.
#
# case_sensitive:: when false/nil (default) the words are downcased.
# kill_regexp::    words matching any of these patterns are dropped.
# kwhash::         words whose downcased form is a truthy key here are
#                  dropped.
#
# The joined descriptions are split on punctuation, whitespace and
# control characters. Leading runs of $ * - + and trailing runs of
# $ * - = are trimmed from every token, and tokens of one character or
# less are discarded.
def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
          kwhash = self.class::KillWordsHash)
  tokens = descriptions.join(' ').split(/[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\#\s\x00-\x1f\x7f]+/)

  kept = tokens.each_with_object([]) do |token, acc|
    token.sub!(/\A[\$\*\-\+]+/, '')
    token.sub!(/[\$\*\-\=]+\z/, '')
    next if token.size <= 1
    next if kwhash[token.downcase]
    next if kill_regexp.any? { |expr| expr =~ token }

    acc << token
  end

  kept = kept.map { |word| word.downcase } unless case_sensitive
  kept.sort.uniq
end

[Validate]