Package translate :: Package tools :: Module poterminology
[hide private]
[frames] | [no frames]

Source Code for Module translate.tools.poterminology

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # This file is part of translate. 
  5  # 
  6  # translate is free software; you can redistribute it and/or modify 
  7  # it under the terms of the GNU General Public License as published by 
  8  # the Free Software Foundation; either version 2 of the License, or 
  9  # (at your option) any later version. 
 10  # 
 11  # translate is distributed in the hope that it will be useful, 
 12  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 13  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 14  # GNU General Public License for more details. 
 15  # 
 16  # You should have received a copy of the GNU General Public License 
 17  # along with translate; if not, write to the Free Software 
 18  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 19   
 20  """reads a set of .po or .pot files to produce a pootle-terminology.pot 
 21   
 22  See: http://translate.sourceforge.net/wiki/toolkit/poterminology for examples and 
 23  usage instructions 
 24  """ 
 25  import os 
 26  import re 
 27  import sys 
 28  import logging 
 29   
 30  from translate.lang import factory as lang_factory 
 31  from translate.misc import optrecurse 
 32  from translate.storage import po 
 33  from translate.storage import factory 
 34  from translate.misc import file_discovery 
 35   
 36   
def create_termunit(term, unit, targets, locations, sourcenotes, transnotes, filecounts):
    """Build a terminology po unit for *term*.

    :param term: the term text used as the unit source
    :param unit: an existing unit to merge metadata from, or None
    :param targets: dict mapping target text -> list of contributing filenames
    :param locations: iterable of location strings to attach
    :param sourcenotes: iterable of developer notes to attach
    :param transnotes: iterable of translator notes to attach
    :param filecounts: dict mapping input filename -> occurrence count
    :return: the populated po unit
    """
    termunit = po.pounit(term)
    if unit is not None:
        termunit.merge(unit, overwrite=False, comments=False)
    if len(targets) > 1:
        # several competing translations: annotate each with its source files
        txt = '; '.join("%s {%s}" % (target, ', '.join(files))
                        for target, files in targets.items())
        if termunit.target.find('};') < 0:
            termunit.target = txt
            termunit.markfuzzy()
        else:
            # if annotated multiple terms already present, keep as-is
            termunit.addnote(txt, "translator")
    for location in locations:
        termunit.addlocation(location)
    for sourcenote in sourcenotes:
        termunit.addnote(sourcenote, "developer")
    for transnote in transnotes:
        termunit.addnote(transnote, "translator")
    for filename, count in filecounts.items():
        termunit.addnote("(poterminology) %s (%d)\n" % (filename, count), 'translator')
    return termunit
59 60
class TerminologyExtractor(object):
    """Accumulates terminology candidates from translation units into a
    glossary, then filters and scores them into terminology units."""

    def __init__(self, foldtitle=True, ignorecase=False, accelchars="", termlength=3,
                 sourcelanguage="en", invert=False, stopfile=None):
        """
        :param foldtitle: lowercase "Title Case" words when matching
        :param ignorecase: lowercase all words when matching
        :param accelchars: accelerator characters to strip before matching
        :param termlength: maximum number of words in a generated term
        :param sourcelanguage: language code used to find the default stoplist
        :param invert: swap source and target when extracting
        :param stopfile: stopword list path; default is the bundled
            ``stoplist-<sourcelanguage>`` file if one can be found
        """
        self.foldtitle = foldtitle
        self.ignorecase = ignorecase
        self.accelchars = accelchars
        self.termlength = termlength

        self.sourcelanguage = sourcelanguage
        self.invert = invert

        # stopword state; the case-mapping flags may be overridden by
        # !C/!F/!I directives inside the stopword file
        self.stopwords = {}
        self.stoprelist = []
        self.stopfoldtitle = True
        self.stopignorecase = False

        if stopfile is None:
            try:
                stopfile = file_discovery.get_abs_data_filename('stoplist-%s' % self.sourcelanguage)
            except Exception:
                # no bundled stoplist for this language; continue without one
                pass
        self.stopfile = stopfile
        if self.stopfile is not None:
            # guard added: the original called this unconditionally and
            # crashed with open(None) when no stoplist was found
            self.parse_stopword_file()

        # handles c-format and python-format
        self.formatpat = re.compile(r"%(?:\([^)]+\)|[0-9]+\$)?[-+#0]*[0-9.*]*(?:[hlLzjt][hl])?[EFGXc-ginoprsux]")
        # handles XML/HTML elements (<foo>text</foo> => text)
        self.xmlelpat = re.compile(r"<(?:![[-]|[/?]?[A-Za-z_:])[^>]*>")
        # handles XML/HTML entities (&#32; &#x20; &amp; &my_entity;)
        self.xmlentpat = re.compile(r"&(?:#(?:[0-9]+|x[0-9a-f]+)|[a-z_:][\w.-:]*);",
                                    flags=re.UNICODE | re.IGNORECASE)

        self.units = 0
        self.glossary = {}

    def parse_stopword_file(self):
        """Parse self.stopfile into self.stopwords / self.stoprelist.

        Each line's first character selects the action; '#' is a comment,
        '!' a case-mapping directive, '/' a regular expression, and the
        remaining prefixes map to stopword attribute sets.
        """
        actions = {'+': frozenset(), ':': frozenset(['skip']),
                   '<': frozenset(['phrase']), '=': frozenset(['word']),
                   '>': frozenset(['word', 'skip']),
                   '@': frozenset(['word', 'phrase'])}

        line = 0
        with open(self.stopfile, "r") as stopfile:
            try:
                for stopline in stopfile:
                    line += 1
                    stoptype = stopline[0]
                    if stoptype == '#' or stoptype == "\n":
                        continue
                    elif stoptype == '!':
                        # case-mapping directive: C=preserve, F=fold title, I=ignore
                        if stopline[1] == 'C':
                            self.stopfoldtitle = False
                            self.stopignorecase = False
                        elif stopline[1] == 'F':
                            self.stopfoldtitle = True
                            self.stopignorecase = False
                        elif stopline[1] == 'I':
                            self.stopignorecase = True
                        else:
                            logging.warning("%s line %d - bad case mapping directive",
                                            self.stopfile, line)
                    elif stoptype == '/':
                        # anchored regular-expression stopword
                        self.stoprelist.append(re.compile(stopline[1:-1] + '$'))
                    else:
                        self.stopwords[stopline[1:-1]] = actions[stoptype]
            except KeyError:
                # unknown action prefix: warn and ignore the rest of the file
                logging.warning("%s line %d - bad stopword entry starts with '%s'",
                                self.stopfile, line, stoptype)
                logging.warning("%s line %d - all lines after error ignored",
                                self.stopfile, line + 1)

    def clean(self, string):
        """returns the cleaned string that contains the text to be matched"""
        for accelerator in self.accelchars:
            string = string.replace(accelerator, "")
        string = self.formatpat.sub(" ", string)
        string = self.xmlelpat.sub(" ", string)
        string = self.xmlentpat.sub(" ", string)
        string = string.strip()
        return string

    def stopmap(self, word):
        """return case-mapped stopword for input word"""
        if self.stopignorecase or (self.stopfoldtitle and word.istitle()):
            word = word.lower()
        return word

    def stopword(self, word, defaultset=frozenset()):
        """return stoplist frozenset for input word"""
        return self.stopwords.get(self.stopmap(word), defaultset)

    def addphrases(self, words, skips, translation, partials=True):
        """adds (sub)phrases with non-skipwords and more than one word"""
        if (len(words) > skips + 1 and
                'skip' not in self.stopword(words[0]) and
                'skip' not in self.stopword(words[-1])):
            self.glossary.setdefault(' '.join(words), []).append(translation)
        if partials:
            part = list(words)
            while len(part) > 2:
                if 'skip' in self.stopword(part.pop()):
                    skips -= 1
                if (len(part) > skips + 1 and
                        'skip' not in self.stopword(part[0]) and
                        'skip' not in self.stopword(part[-1])):
                    self.glossary.setdefault(' '.join(part), []).append(translation)

    def processunits(self, units, fullinputpath):
        """Accumulate term candidates from *units* into self.glossary.

        *fullinputpath* is recorded with each candidate so the generated
        terminology notes can report which input files contributed it.
        """
        sourcelang = lang_factory.getlanguage(self.sourcelanguage)
        rematchignore = frozenset(('word', 'phrase'))
        defaultignore = frozenset()
        for unit in units:
            self.units += 1
            if unit.isheader():
                continue
            if not self.invert:
                source = self.clean(unit.source)
                target = self.clean(unit.target)
            else:
                # extract terminology for the target language instead
                target = self.clean(unit.source)
                source = self.clean(unit.target)
            if len(source) <= 1:
                continue
            for sentence in sourcelang.sentences(source):
                words = []
                skips = 0
                for word in sourcelang.words(sentence):
                    stword = self.stopmap(word)
                    if self.ignorecase or (self.foldtitle and word.istitle()):
                        word = word.lower()
                    ignore = defaultignore
                    if stword in self.stopwords:
                        ignore = self.stopwords[stword]
                    else:
                        for stopre in self.stoprelist:
                            if stopre.match(stword) is not None:
                                ignore = rematchignore
                                break
                    translation = (source, target, unit, fullinputpath)
                    if 'word' not in ignore:
                        # reduce plurals: fold "words"/"word" onto one key
                        root = word
                        if len(word) > 3 and word[-1] == 's' and word[0:-1] in self.glossary:
                            root = word[0:-1]
                        elif len(root) > 2 and root + 's' in self.glossary:
                            self.glossary[root] = self.glossary.pop(root + 's')
                        self.glossary.setdefault(root, []).append(translation)
                    if self.termlength > 1:
                        if 'phrase' in ignore:
                            # add trailing phrases in previous words
                            while len(words) > 2:
                                if 'skip' in self.stopword(words.pop(0)):
                                    skips -= 1
                                self.addphrases(words, skips, translation)
                            words = []
                            skips = 0
                        else:
                            words.append(word)
                            if 'skip' in ignore:
                                skips += 1
                            if len(words) > self.termlength + skips:
                                while len(words) > self.termlength + skips:
                                    if 'skip' in self.stopword(words.pop(0)):
                                        skips -= 1
                                self.addphrases(words, skips, translation)
                            else:
                                self.addphrases(words, skips, translation, partials=False)
                if self.termlength > 1:
                    # add trailing phrases in sentence after reaching end
                    while len(words) > 2:
                        if 'skip' in self.stopword(words.pop(0)):
                            skips -= 1
                        self.addphrases(words, skips, translation)

    def extract_terms(self, create_termunit=None, inputmin=1, fullmsgmin=1, substrmin=2, locmin=2):
        """Convert the accumulated glossary into scored terminology units.

        :param create_termunit: factory building each unit; defaults to the
            module-level ``create_termunit`` (late-bound so the class body
            does not depend on definition order)
        :param inputmin: minimum number of input files a term must occur in
        :param fullmsgmin: minimum distinct messages for full-message terms
        :param substrmin: minimum distinct messages for substring-only terms
        :param locmin: minimum distinct source locations
        :return: dict mapping term -> (score, termunit)
        """
        if create_termunit is None:
            create_termunit = globals()["create_termunit"]
        terms = {}
        # strip ":linenumber" suffixes so locations collapse per source file
        locre = re.compile(r":[0-9]+$")
        print("%d terms from %d units" % (len(self.glossary), self.units),
              file=sys.stderr)
        for term, translations in self.glossary.items():
            if len(translations) <= 1:
                continue
            filecounts = {}
            sources = set()
            locations = set()
            sourcenotes = set()
            transnotes = set()
            targets = {}
            fullmsg = False
            bestunit = None
            for source, target, unit, filename in translations:
                sources.add(source)
                filecounts[filename] = filecounts.get(filename, 0) + 1
                #FIXME: why reclean source and target?!
                if term.lower() == self.clean(unit.source).lower():
                    fullmsg = True
                    target = self.clean(unit.target)
                    if self.ignorecase or (self.foldtitle and target.istitle()):
                        target = target.lower()
                    unit.target = target
                    if target != "":
                        targets.setdefault(target, []).append(filename)
                    if term.lower() == unit.source.strip().lower():
                        sourcenotes.add(unit.getnotes("source code"))
                        transnotes.add(unit.getnotes("translator"))
                    unit.source = term
                    bestunit = unit
                #FIXME: figure out why we did a merge to begin with
                #termunit.merge(unit, overwrite=False, comments=False)
                for loc in unit.getlocations():
                    locations.add(locre.sub("", loc))

            numsources = len(sources)
            numfiles = len(filecounts)
            numlocs = len(locations)
            if numfiles < inputmin or 0 < numlocs < locmin:
                continue
            if fullmsg:
                if numsources < fullmsgmin:
                    continue
            elif numsources < substrmin:
                continue

            # cap the reported locations at twice the minimum
            locmax = 2 * locmin
            if numlocs > locmax:
                locations = list(locations)[0:locmax]
                locations.append("(poterminology) %d more locations"
                                 % (numlocs - locmax))

            termunit = create_termunit(term, bestunit, targets, locations,
                                       sourcenotes, transnotes, filecounts)
            # score: file spread dominates, distinct sources break ties
            terms[term] = ((10 * numfiles) + numsources, termunit)
        return terms

    def filter_terms(self, terms, nonstopmin=1, sortorders=None):
        """reduce subphrases from extracted terms

        :param terms: dict of term -> (score, termunit) as returned by
            extract_terms; entries may be deleted in place
        :param nonstopmin: minimum number of non-stopwords per term
        :param sortorders: list of sort orders, applied so the first listed
            order has highest priority (default: frequency, dictionary,
            length).  The original used a mutable default list and popped
            it destructively, which emptied the default for later calls.
        :return: sorted list of (score, termunit) pairs
        """
        if sortorders is None:
            sortorders = ["frequency", "dictionary", "length"]
        # reduce subphrase: shortest terms first so prefixes/suffixes of a
        # longer term with the same score can be removed
        termlist = sorted(terms.keys(), key=len)
        print("%d terms after thresholding" % len(termlist), file=sys.stderr)
        for term in termlist:
            words = term.split()
            nonstop = [word for word in words if not self.stopword(word)]
            if len(nonstop) < nonstopmin and len(nonstop) != len(words):
                del terms[term]
                continue
            if len(words) <= 2:
                continue
            while len(words) > 2:
                words.pop()
                if terms[term][0] == terms.get(' '.join(words), [0])[0]:
                    del terms[' '.join(words)]
            words = term.split()
            while len(words) > 2:
                words.pop(0)
                if terms[term][0] == terms.get(' '.join(words), [0])[0]:
                    del terms[' '.join(words)]
        print("%d terms after subphrase reduction" % len(terms), file=sys.stderr)
        termitems = list(terms.values())
        # apply the lowest-priority order first; the first listed order is
        # applied last and therefore dominates (sorts are stable)
        for order in reversed(sortorders):
            if order == "frequency":
                termitems.sort(key=lambda item: item[0], reverse=True)
            elif order == "dictionary":
                termitems.sort(key=lambda item: item[1].source.lower())
            elif order == "length":
                termitems.sort(key=lambda item: len(item[1].source))
            else:
                logging.warning("unknown sort order %s", order)
        return termitems
331 332
class TerminologyOptionParser(optrecurse.RecursiveOptionParser):
    """a specialized Option Parser for the terminology tool..."""

    def parse_args(self, args=None, values=None):
        """parses the command line options, handling implicit input/output args"""
        (options, args) = optrecurse.optparse.OptionParser.parse_args(self, args, values)
        # some intelligence as to what reasonable people might give on the command line
        if args and not options.input:
            if not options.output and not options.update and len(args) > 1:
                options.input = args[:-1]
                args = args[-1:]
            else:
                options.input = args
                args = []
        # don't overwrite last freestanding argument file, to avoid accidents
        # due to shell wildcard expansion
        if args and not options.output and not options.update:
            if os.path.lexists(args[-1]) and not os.path.isdir(args[-1]):
                self.error("To overwrite %s, specify it with -o/--output or -u/--update" % (args[-1]))
            options.output = args[-1]
            args = args[:-1]
        if options.output and options.update:
            self.error("You cannot use both -u/--update and -o/--output")
        if args:
            self.error("You have used an invalid combination of -i/--input, -o/--output, -u/--update and freestanding args")
        if not options.input:
            self.error("No input file or directory was specified")
        # a single input file (not a directory) implies an inputs-needed
        # threshold of 1; multiple/recursive inputs default to 2
        if isinstance(options.input, list) and len(options.input) == 1:
            options.input = options.input[0]
            if options.inputmin is None:
                options.inputmin = 1
        elif not isinstance(options.input, list) and not os.path.isdir(options.input):
            if options.inputmin is None:
                options.inputmin = 1
        elif options.inputmin is None:
            options.inputmin = 2
        if options.update:
            # the update file is both an extra input and the output
            options.output = options.update
            if isinstance(options.input, list):
                options.input.append(options.update)
            elif options.input:
                options.input = [options.input, options.update]
            else:
                options.input = options.update
        if not options.output:
            options.output = "pootle-terminology.pot"
        return (options, args)

    def set_usage(self, usage=None):
        """sets the usage string - if usage not given, uses getusagestring for each option"""
        if usage is None:
            self.usage = "%prog " + " ".join([self.getusagestring(option) for option in self.option_list]) + \
                "\n input directory is searched for PO files, terminology PO file is output file"
        else:
            super(TerminologyOptionParser, self).set_usage(usage)

    def run(self):
        """parses the arguments, and runs recursiveprocess with the resulting options"""
        self.files = 0
        (options, args) = self.parse_args()
        options.inputformats = self.inputformats
        options.outputoptions = self.outputoptions
        self.usepsyco(options)
        self.extractor = TerminologyExtractor(foldtitle=options.foldtitle, ignorecase=options.ignorecase,
                                              accelchars=options.accelchars, termlength=options.termlength,
                                              sourcelanguage=options.sourcelanguage,
                                              invert=options.invert, stopfile=options.stopfile)
        self.recursiveprocess(options)

    def recursiveprocess(self, options):
        """recurse through directories and process files"""
        if self.isrecursive(options.input, 'input') and getattr(options, "allowrecursiveinput", True):
            if isinstance(options.input, list):
                inputfiles = self.recurseinputfilelist(options)
            else:
                inputfiles = self.recurseinputfiles(options)
        else:
            if options.input:
                inputfiles = [os.path.basename(options.input)]
                options.input = os.path.dirname(options.input)
            else:
                inputfiles = [options.input]
        # if the output is a directory, write the default file inside it
        if os.path.isdir(options.output):
            options.output = os.path.join(options.output, "pootle-terminology.pot")

        self.initprogressbar(inputfiles, options)
        for inputpath in inputfiles:
            self.files += 1
            fullinputpath = self.getfullinputpath(options, inputpath)
            success = True
            try:
                self.processfile(None, options, fullinputpath)
            except KeyboardInterrupt:
                raise
            except Exception:
                self.warning("Error processing: input %s" % (fullinputpath), options, sys.exc_info())
                success = False
            self.reportprogress(inputpath, success)
        del self.progressbar
        self.outputterminology(options)

    def processfile(self, fileprocessor, options, fullinputpath):
        """process an individual file"""
        inputfile = self.openinputfile(options, fullinputpath)
        inputfile = factory.getobject(inputfile)
        self.extractor.processunits(inputfile.units, fullinputpath)

    def outputterminology(self, options):
        """saves the generated terminology glossary"""
        termfile = po.pofile()
        print("scanned %d files" % self.files, file=sys.stderr)
        terms = self.extractor.extract_terms(inputmin=options.inputmin, fullmsgmin=options.fullmsgmin,
                                             substrmin=options.substrmin, locmin=options.locmin)
        termitems = self.extractor.filter_terms(terms, nonstopmin=options.nonstopmin, sortorders=options.sortorders)
        for _count, unit in termitems:
            termfile.units.append(unit)
        with open(options.output, "w") as outfile:
            outfile.write(str(termfile))
450 451
def fold_case_option(option, opt_str, value, parser):
    """optparse callback for -F/--fold-titlecase: lowercase only words
    that are in Title Case, keeping other casing as-is."""
    values = parser.values
    values.foldtitle = True
    values.ignorecase = False
455 456
def preserve_case_option(option, opt_str, value, parser):
    """optparse callback for -C/--preserve-case: disable both lowercase
    folding modes so terms keep their original case."""
    parser.values.foldtitle = False
    parser.values.ignorecase = False
459 460
def main():
    """Command-line entry point: build the option parser and run it."""
    formats = {"po": ("po", None), "pot": ("pot", None), None: ("po", None)}
    parser = TerminologyOptionParser(formats)

    parser.add_option("-u", "--update", type="string", dest="update",
                      metavar="UPDATEFILE", help="update terminology in UPDATEFILE")

    parser.add_option("-S", "--stopword-list", type="string", metavar="STOPFILE", dest="stopfile",
                      help="read stopword (term exclusion) list from STOPFILE (default %s)" %
                      file_discovery.get_abs_data_filename('stoplist-en'))

    parser.set_defaults(foldtitle=True, ignorecase=False)
    parser.add_option("-F", "--fold-titlecase", callback=fold_case_option,
                      action="callback", help="fold \"Title Case\" to lowercase (default)")
    parser.add_option("-C", "--preserve-case", callback=preserve_case_option,
                      action="callback", help="preserve all uppercase/lowercase")
    parser.add_option("-I", "--ignore-case", dest="ignorecase",
                      action="store_true", help="make all terms lowercase")

    parser.add_option("", "--accelerator", dest="accelchars", default="",
                      metavar="ACCELERATORS", help="ignores the given accelerator characters when matching")

    # NOTE: optparse does not pass defaults through the type converter, so
    # integer options must default to real ints (the original used strings
    # like "3", which broke comparisons such as termlength > 1).
    parser.add_option("-t", "--term-words", type="int", dest="termlength", default=3,
                      help="generate terms of up to LENGTH words (default 3)", metavar="LENGTH")
    parser.add_option("", "--nonstop-needed", type="int", dest="nonstopmin", default=1,
                      help="omit terms with less than MIN nonstop words (default 1)", metavar="MIN")
    parser.add_option("", "--inputs-needed", type="int", dest="inputmin",
                      help="omit terms appearing in less than MIN input files (default 2, or 1 if only one input file)", metavar="MIN")
    parser.add_option("", "--fullmsg-needed", type="int", dest="fullmsgmin", default=1,
                      help="omit full message terms appearing in less than MIN different messages (default 1)", metavar="MIN")
    parser.add_option("", "--substr-needed", type="int", dest="substrmin", default=2,
                      help="omit substring-only terms appearing in less than MIN different messages (default 2)", metavar="MIN")
    parser.add_option("", "--locs-needed", type="int", dest="locmin", default=2,
                      help="omit terms appearing in less than MIN different original source files (default 2)", metavar="MIN")

    sortorders_default = ["frequency", "dictionary", "length"]
    # NOTE(review): optparse's action="append" appends user values to this
    # default list instead of replacing it — confirm whether that is the
    # intended --sort behavior before changing it.
    parser.add_option("", "--sort", dest="sortorders", action="append",
                      type="choice", choices=sortorders_default, metavar="ORDER", default=sortorders_default,
                      help="output sort order(s): %s (default is all orders in the above priority)" % ', '.join(sortorders_default))

    parser.add_option("", "--source-language", dest="sourcelanguage", default="en",
                      help="the source language code (default 'en')", metavar="LANG")
    parser.add_option("-v", "--invert", dest="invert",
                      action="store_true", default=False, help="invert the source and target languages for terminology")
    parser.set_usage()
    parser.description = __doc__
    parser.run()


if __name__ == '__main__':
    main()
508 509 510 if __name__ == '__main__': 511 main() 512