Class | Bio::FastaDefline |
In: |
lib/bio/db/fasta/defline.rb
|
Parent: | Object |
Parses a FASTA definition line (defline) and extracts IDs and other information. IDs are NSIDs (NCBI standard FASTA sequence identifiers) or ":"-separated IDs.
The specifications are described in: ftp.ncbi.nih.gov/blast/documents/README.formatdb and blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]') rub.entry_id ==> 'gi|671595' rub.get('emb') ==> 'CAA85678.1' rub.emb ==> 'CAA85678.1' rub.gi ==> '671595' rub.accession ==> 'CAA85678' rub.accessions ==> [ 'CAA85678' ] rub.acc_version ==> 'CAA85678.1' rub.locus ==> nil rub.list_ids ==> [["gi", "671595"], ["emb", "CAA85678.1", nil], ["Perovskia abrotanoides"]] ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]") ckr.entry_id ==> "gi|2495000" ckr.sp ==> "CCKR_CAVPO" ckr.pir ==> "I51898" ckr.gb ==> "AAB29504.1" ckr.gi ==> "2495000" ckr.accession ==> "AAB29504" ckr.accessions ==> ["Q63931", "AAB29504"] ckr.acc_version ==> "AAB29504.1" ckr.locus ==> nil ckr.description ==> "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)" ckr.descriptions ==> ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)", "cholecystokinin A receptor - guinea pig", "cholecystokinin A receptor; CCK-A receptor [Cavia]"] ckr.words ==> ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig", "receptor", "type"] ckr.id_strings ==> ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898", "544724", "AAB29504.1", "Cavia"] ckr.list_ids ==> [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"], ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"], ["gb", "AAB29504.1", nil], ["Cavia"]]
NSIDs | = | { # NCBI and WU-BLAST 'gi' => [ 'gi' ], # NCBI GI 'gb' => [ 'acc_version', 'locus' ], # GenBank 'emb' => [ 'acc_version', 'locus' ], # EMBL 'dbj' => [ 'acc_version', 'locus' ], # DDBJ 'sp' => [ 'accession', 'entry_id' ], # SWISS-PROT 'tr' => [ 'accession', 'entry_id' ], # TREMBL 'pdb' => [ 'entry_id', 'chain' ], # PDB 'bbs' => [ 'number' ], # GenInfo Backbone Id 'gnl' => [ 'database' , 'entry_id' ], # General database identifier 'ref' => [ 'acc_version' , 'locus' ], # NCBI Reference Sequence 'lcl' => [ 'entry_id' ], # Local Sequence identifier # WU-BLAST and NCBI 'pir' => [ 'accession', 'entry_id' ], # PIR 'prf' => [ 'accession', 'entry_id' ], # Protein Research Foundation 'pat' => [ 'country', 'number', 'serial' ], # Patents # WU-BLAST only 'bbm' => [ 'number' ], # NCBI GenInfo Backbone database identifier 'gim' => [ 'number' ], # NCBI GenInfo Import identifier 'gp' => [ 'acc_version', 'locus' ], # GenPept 'oth' => [ 'accession', 'name', 'release' ], # Other (user-definable) identifier 'tpd' => [ 'accession', 'name' ], # Third party annotation, DDBJ 'tpe' => [ 'accession', 'name' ], # Third party annotation, EMBL 'tpg' => [ 'accession', 'name' ], # Third party annotation, GenBank # Original 'ri' => [ 'entry_id', 'rearray_id', 'len' ], # RIKEN FANTOM DB } |
KillWords | = | [ 'an', 'the', 'this', 'that', 'is', 'are', 'were', 'was', 'be', 'can', 'may', 'might', 'as', 'at', 'by', 'for', 'in', 'of', 'on', 'to', 'with', 'from', 'and', 'or', 'not', 'dna', 'rna', 'mrna', 'cdna', 'orf', 'aa', 'nt', 'pct', 'id', 'ec', 'sp', 'subsp', 'similar', 'involved', 'identical', 'identity', 'cds', 'clone', 'library', 'contig', 'contigs', 'homolog', 'homologue', 'homologs', 'homologous', 'protein', 'proteins', 'gene', 'genes', 'product', 'products', 'sequence', 'sequences', 'strain', 'strains', 'region', 'regions', ] |
KillWordsHash | = | {} |
KillRegexpArray | = | [ /\A\d{1,3}\%?\z/, /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/, /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/ ] |
entry_id | [R] | Shows a possibly unique identifier. Returns a string. |
list_ids | [R] | Shows an array that contains IDs (or ID-like strings). Returns an array of arrays of strings.
Parses given string.
# File lib/bio/db/fasta/defline.rb, line 181
# Creates a new object from a defline string.
#
# +str+ may hold several deflines joined by the "\x01" (Ctrl-A)
# character; every piece is handed to #add_defline in order.
def initialize(str)
  @deflines = []
  @info     = {}
  @list_ids = []
  @entry_id = nil

  str.split("\x01").each { |line| add_defline(line) }
end
Shows the accession with version number. If the entry has more than one such ID, only the first one is shown. Returns a string or nil.
# File lib/bio/db/fasta/defline.rb, line 494
# Returns the identifier labelled 'acc_version', as found by
# #get_by_type, or nil.
#
# The lookup runs only once: the result (even when it is nil) is kept
# in @acc_version, which is why the check uses defined? and not ||=.
def acc_version
  @acc_version = get_by_type('acc_version') unless defined?(@acc_version)
  @acc_version
end
Shows accession numbers. Returns an array of strings.
# File lib/bio/db/fasta/defline.rb, line 503
# Returns an array of accession strings.
#
# Collects every 'accession' and 'acc_version' identifier through
# #get_all_by_type and removes everything from the first "." onward
# (the version suffix). The array is computed once and cached in
# @accessions.
def accessions
  return @accessions if defined?(@accessions)

  @accessions = get_all_by_type('accession', 'acc_version')
  @accessions.map! { |acc| acc.sub(/\..*\z/, '') }
  @accessions
end
Parses given string and adds parsed data.
# File lib/bio/db/fasta/defline.rb, line 195 195: def add_defline(str) 196: case str 197: when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/ 198: # NSIDs 199: # examples: 200: # >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P 201: # 202: # note: regexp (:?) means grouping without backreferences 203: i = $1 204: d = $2 205: tks = i.split('|') 206: tks << '' if i[-1,1] == '|' 207: a = parse_NSIDs(tks) 208: i = a[0].join('|') 209: a.unshift('|') 210: d = tks.join('|') + ' ' + d unless tks.empty? 211: a << d 212: this_line = a 213: match_EC(d) 214: parse_square_brackets(d).each do |x| 215: if !match_EC(x, false) and x =~ /\A[A-Z]/ then 216: di = [ x ] 217: @list_ids << di 218: @info['organism'] = x unless @info['organism'] 219: end 220: end 221: 222: when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/ 223: # examples: 224: # >sce:YBR160W CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST] 225: # >emb:CACDC28 [X80034] C.albicans CDC28 gene 226: i = $1 227: d = $2 228: a = parse_ColonSepID(i) 229: i = a.join(':') 230: this_line = [ ':', a , d ] 231: match_EC(d) 232: parse_square_brackets(d).each do |x| 233: if !match_EC(x, false) and x =~ /:/ then 234: parse_ColonSepID(x) 235: elsif x =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/ then 236: @list_ids << [ $1 ] 237: end 238: end 239: 240: when /^\>?\s*(\S+)(?:\s+(.+))?$/ 241: # examples: 242: # >ABC12345 this is test 243: i = $1 244: d = $2.to_s 245: @list_ids << [ i.chomp('.') ] 246: this_line = [ '', [ i ], d ] 247: match_EC(d) 248: else 249: i = str 250: d = '' 251: match_EC(i) 252: this_line = [ '', [ i ], d ] 253: end 254: 255: @deflines << this_line 256: @entry_id = i unless @entry_id 257: end
Shows description.
# File lib/bio/db/fasta/defline.rb, line 337
# Returns the description of the first defline, i.e. the last element
# of the first record in @deflines, or nil when no defline is stored.
def description
  Array(@deflines.first).last
end
Returns descriptions.
# File lib/bio/db/fasta/defline.rb, line 342
# Returns an array holding the description (the last element) of every
# record stored in @deflines, in order.
def descriptions
  @deflines.map(&:last)
end
Returns an identifier by database name.
# File lib/bio/db/fasta/defline.rb, line 418
# Returns the identifier stored for the database +dbname+ (a String or
# anything responding to to_s), or nil when nothing is found.
#
# Lookup order:
# 1. a value already present in @info under that name;
# 2. the first @list_ids entry whose first element equals the name:
#    * entries with at most two elements yield their last element;
#    * longer entries are read through the NSIDs label table,
#      preferring 'acc_version', 'entry_id', 'locus', 'accession' and
#      'number' in that order, and falling back to the first non-nil
#      element after the database name.
#
# A value found in step 2 is stored in @info for later calls.
def get(dbname)
  key = dbname.to_s
  cached = @info[key]
  return cached if cached

  entry = @list_ids.find { |ids| ids[0] == key }
  return cached unless entry

  found =
    if entry.size <= 2
      entry[-1]
    else
      labels = self.class::NSIDs[key]
      preferred = nil
      ['acc_version', 'entry_id', 'locus', 'accession', 'number'].each do |label|
        pos = labels.index(label)
        next unless pos
        preferred = entry[pos + 1]
        break if preferred
      end
      preferred || entry[1..-1].find { |id| id }
    end

  @info[key] = found if found
  found
end
Returns identifiers by given type.
# File lib/bio/db/fasta/defline.rb, line 454
# Returns every identifier whose NSIDs label is one of the given types.
#
# @list_ids is scanned in order; for each entry whose database name is
# a key of NSIDs, the requested types are checked in argument order and
# the matching non-nil elements are collected. Returns an array
# (empty when nothing matches).
def get_all_by_type(*type_strarg)
  @list_ids.each_with_object([]) do |ids, found|
    labels = self.class::NSIDs[ids[0]]
    next unless labels

    type_strarg.each do |type|
      pos = labels.index(type)
      found << ids[pos + 1] if pos && ids[pos + 1]
    end
  end
end
Returns an identifier by given type.
# File lib/bio/db/fasta/defline.rb, line 442
# Returns the first identifier whose NSIDs label equals +type_str+,
# or nil when no entry carries one.
#
# @list_ids is scanned in order. An entry is considered only when its
# database name is a key of NSIDs and that database declares the
# requested label. Entries whose slot for the label is empty (nil),
# such as ["pir", nil, "I51898"] for 'accession', are skipped so that a
# later entry that does hold the identifier is still found; this
# matches how #get_all_by_type ignores empty slots.
def get_by_type(type_str)
  @list_ids.each do |ids|
    labels = self.class::NSIDs[ids[0]]
    next unless labels

    pos = labels.index(type_str)
    next unless pos

    value = ids[pos + 1]
    return value if value
  end
  nil
end
Shows the GI. If the entry has more than one such ID, only the first one is shown. Returns a string or nil.
# File lib/bio/db/fasta/defline.rb, line 483
# Returns the identifier labelled 'gi', as found by #get_by_type,
# or nil.
#
# The lookup runs only once: the result (even when it is nil) is kept
# in @gi, which is why the check uses defined? and not ||=.
def gi
  @gi = get_by_type('gi') unless defined?(@gi)
  @gi
end
Shows ID-like strings. Returns an array of strings.
# File lib/bio/db/fasta/defline.rb, line 350
# Returns an array of ID-like strings.
#
# The result is built from two sources, in this order:
# * @list_ids: entries with two or more elements contribute every
#   non-nil element after the first; shorter entries contribute their
#   only element when it is non-empty and consists solely of letters,
#   digits, ".", "-" and "_";
# * the case-sensitive word list from #words (called with no kill
#   regexps), restricted to words matching one of the two ID-like
#   patterns below.
def id_strings
  from_ids = @list_ids.flat_map do |ids|
    if ids.size >= 2
      ids[1..-1].select { |id| id }
    elsif ids[0].to_s.size > 0 && ids[0] =~ /\A[A-Za-z0-9\.\-\_]+\z/
      [ids[0]]
    else
      []
    end
  end

  id_like_words = words(true, []).select do |word|
    word =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ ||
      word =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
  end

  from_ids + id_like_words
end
# File lib/bio/db/fasta/defline.rb, line 523
# Lets a database name be used as a method name, e.g. defline.emb is
# the same as defline.get('emb').
#
# Returns the value of #get for +name+. When #get finds nothing, nil is
# returned for names that are keys of NSIDs; any other name is passed
# to +super+, which raises NoMethodError. A real NoMethodError (and not
# a RuntimeError carrying the text "NameError") matters because Ruby's
# implicit conversion probes (to_ary, to_str, ...) only rescue
# NoMethodError; otherwise calls such as Array#flatten on an array
# holding this object would fail.
def method_missing(name, *args)
  found = get(name, *args)
  return found if found || self.class::NSIDs[name.to_s]

  super
end

# Keeps respond_to? consistent with #method_missing: true for names
# that are keys of NSIDs and for names that #get can resolve.
def respond_to_missing?(name, include_private = false)
  key = name.to_s
  return true if self.class::NSIDs.key?(key) || get(key)

  super
end
Shows the original string. Note that the result of this method may differ from the original string that was given to FastaDefline.new.
# File lib/bio/db/fasta/defline.rb, line 329
# Rebuilds a defline string from the parsed records.
#
# Each record in @deflines has the form [separator, ids..., description].
# The ID arrays are joined with the record's separator, a space and the
# description are appended, surrounding whitespace is stripped, and the
# resulting lines are joined with "\x01".
def to_s
  lines = @deflines.map do |record|
    sep = record[0]
    ids = record[1..-2].map { |id| id.join(sep) }.join(sep)
    (ids + ' ' + record[-1]).strip
  end
  lines.join("\x01")
end
Shows words used in the defline. Returns an Array.
# File lib/bio/db/fasta/defline.rb, line 392
# Returns a sorted array of unique words taken from all descriptions.
#
# case_sensitive:: when false or nil (default), words are downcased
#                  before sorting.
# kill_regexp::    array of regexps; a word matching any of them is
#                  dropped (default: KillRegexpArray).
# kwhash::         hash keyed by downcased words to drop
#                  (default: KillWordsHash).
#
# The joined descriptions are split on punctuation, whitespace and
# control characters. Leading "$", "*", "-", "+" and trailing "$", "*",
# "-", "=" are removed from each piece, and pieces of one character or
# less are discarded.
def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
          kwhash = self.class::KillWordsHash)
  pieces = descriptions.join(' ').split(/[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\#\s\x00-\x1f\x7f]+/)

  kept = pieces.select do |word|
    word.sub!(/\A[\$\*\-\+]+/, '')
    word.sub!(/[\$\*\-\=]+\z/, '')
    word.size > 1 &&
      !kwhash[word.downcase] &&
      kill_regexp.none? { |expr| expr =~ word }
  end

  kept.map! { |word| word.downcase } unless case_sensitive
  kept.sort.uniq
end