
Source Code for Module translate.tools.pogrep

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2002-2008 Zuza Software Foundation
#
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with translate; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

"""Grep XLIFF, Gettext PO and TMX localization files.

Matches are output to snippet files of the same type, which can then be
reviewed and later merged using pomerge.

See http://translate.sourceforge.net/wiki/toolkit/pogrep for examples and
usage instructions.
"""
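
# Illustrative command-line invocations, not part of the original module.  The
# file and directory names below are assumptions; the options themselves are
# the ones defined in cmdlineparser() further down.
#
#     pogrep --search=msgid "Save file" af/ af-matches/
#     pogrep -I -e --search=target "colou?r" messages.po matches.po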

from translate.storage import factory
from translate.misc import optrecurse
from translate.misc.multistring import multistring
from translate.lang import data
import re
import locale


class GrepMatch(object):
    """Just a small data structure that represents a search match."""

    # INITIALIZERS #
    def __init__(self, unit, part='target', part_n=0, start=0, end=0):
        self.unit = unit
        self.part = part
        self.part_n = part_n
        self.start = start
        self.end = end

    # ACCESSORS #
    def get_getter(self):
        if self.part == 'target':
            if self.unit.hasplural():
                getter = lambda: self.unit.target.strings[self.part_n]
            else:
                getter = lambda: self.unit.target
            return getter
        elif self.part == 'source':
            if self.unit.hasplural():
                getter = lambda: self.unit.source.strings[self.part_n]
            else:
                getter = lambda: self.unit.source
            return getter
        elif self.part == 'notes':
            def getter():
                return self.unit.getnotes()[self.part_n]
            return getter
        elif self.part == 'locations':
            def getter():
                return self.unit.getlocations()[self.part_n]
            return getter

    def get_setter(self):
        if self.part == 'target':
            if self.unit.hasplural():
                def setter(value):
                    strings = self.unit.target.strings
                    strings[self.part_n] = value
                    self.unit.target = strings
            else:
                def setter(value):
                    self.unit.target = value
            return setter

    # SPECIAL METHODS #
    def __str__(self):
        start, end = self.start, self.end
        if start < 3:
            start = 3
        if end > len(self.get_getter()()) - 3:
            end = len(self.get_getter()()) - 3
        matchpart = self.get_getter()()[start-2:end+2]
        return '<GrepMatch "%s" part=%s[%d] start=%d end=%d>' % (matchpart, self.part, self.part_n, self.start, self.end)

    def __repr__(self):
        return str(self)
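
    # Illustrative sketch, not part of the original module: how a GrepMatch's
    # accessors read and replace the string the match was found in.  The pypo
    # unit and the strings used here are assumptions made for the example.
    #
    #     from translate.storage import pypo
    #     unit = pypo.pounit(u"Save file")
    #     unit.target = u"Stoor leer"
    #     match = GrepMatch(unit, part='target', start=0, end=5)
    #     match.get_getter()()              # returns the target string searched
    #     match.get_setter()(u"Berg leer")  # replaces the whole target string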


def real_index(string, nfc_index):
    """Calculate the real index in the unnormalized string that corresponds to
    the index nfc_index in the normalized string."""
    length = nfc_index
    max_length = len(string)
    while len(data.normalize(string[:length])) <= nfc_index:
        if length == max_length:
            return length
        length += 1
    return length - 1
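
# Illustrative sketch, not part of the original module: match positions are
# found in the NFC-normalized text, so they have to be mapped back to the raw
# string, which can be longer when it contains decomposed (combining) characters.
#
#     >>> s = u'e\u0301tat'       # 'etat' written with a combining acute accent
#     >>> data.normalize(s)       # NFC composes the first two code points into one
#     u'\xe9tat'
#     >>> real_index(s, 1)        # index 1 ('t') in the NFC form is index 2 in s
#     2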


def find_matches(unit, part, strings, re_search):
    """Return the GrepMatch objects where re_search matches in strings."""
    matches = []
    for n, string in enumerate(strings):
        if not string:
            continue
        normalized = data.normalize(string)
        for matchobj in re_search.finditer(normalized):
            start = real_index(string, matchobj.start())
            end = real_index(string, matchobj.end())
            matches.append(GrepMatch(unit, part=part, part_n=n, start=start, end=end))
    return matches


class GrepFilter:
    def __init__(self, searchstring, searchparts, ignorecase=False, useregexp=False,
                 invertmatch=False, accelchar=None, encoding='utf-8', includeheader=False,
                 max_matches=0):
        """builds a filter that searches the given parts of units for searchstring"""
        if isinstance(searchstring, unicode):
            self.searchstring = searchstring
        else:
            self.searchstring = searchstring.decode(encoding)
        self.searchstring = data.normalize(self.searchstring)
        if searchparts:
            # For now we still support the old terminology, except for the old 'source'
            # which has a new meaning now.
            self.search_source = ('source' in searchparts) or ('msgid' in searchparts)
            self.search_target = ('target' in searchparts) or ('msgstr' in searchparts)
            self.search_notes = ('notes' in searchparts) or ('comment' in searchparts)
            self.search_locations = 'locations' in searchparts
        else:
            self.search_source = True
            self.search_target = True
            self.search_notes = False
            self.search_locations = False
        self.ignorecase = ignorecase
        if self.ignorecase:
            self.searchstring = self.searchstring.lower()
        self.useregexp = useregexp
        if self.useregexp:
            self.searchpattern = re.compile(self.searchstring)
        self.invertmatch = invertmatch
        self.accelchar = accelchar
        self.includeheader = includeheader
        self.max_matches = max_matches

    def matches(self, teststr):
        if teststr is None:
            return False
        teststr = data.normalize(teststr)
        if self.ignorecase:
            teststr = teststr.lower()
        if self.accelchar:
            teststr = re.sub(self.accelchar + self.accelchar, "#", teststr)
            teststr = re.sub(self.accelchar, "", teststr)
        if self.useregexp:
            found = self.searchpattern.search(teststr)
        else:
            found = teststr.find(self.searchstring) != -1
        if self.invertmatch:
            found = not found
        return found
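
    # Illustrative sketch, not part of the original module: accelerator markers
    # are stripped before matching, so a search for "Save" also finds "&Save".
    # The filter construction below is an assumed usage, not taken from the source.
    #
    #     checkfilter = GrepFilter(u"Save", ["target"], accelchar="&")
    #     checkfilter.matches(u"&Save")     # the "&" is removed first, so this is True
    #     checkfilter.matches(u"Cancel")    # no match, so this is False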

    def filterunit(self, unit):
        """runs filters on a translation unit"""
        if unit.isheader():
            return []

        if self.search_source:
            if isinstance(unit.source, multistring):
                strings = unit.source.strings
            else:
                strings = [unit.source]
            for string in strings:
                if self.matches(string):
                    return True

        if self.search_target:
            if isinstance(unit.target, multistring):
                strings = unit.target.strings
            else:
                strings = [unit.target]
            for string in strings:
                if self.matches(string):
                    return True

        if self.search_notes:
            if self.matches(unit.getnotes()):
                return True
        if self.search_locations:
            if self.matches(u" ".join(unit.getlocations())):
                return True
        return False

    def filterfile(self, thefile):
        """runs filters on a translation file object"""
        thenewfile = type(thefile)()
        thenewfile.setsourcelanguage(thefile.sourcelanguage)
        thenewfile.settargetlanguage(thefile.targetlanguage)
        for unit in thefile.units:
            if self.filterunit(unit):
                thenewfile.addunit(unit)
        if self.includeheader and len(thenewfile.units) > 0:
            if thefile.units[0].isheader():
                thenewfile.units.insert(0, thefile.units[0])
            else:
                thenewfile.units.insert(0, thenewfile.makeheader())
        return thenewfile
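
    # Illustrative sketch, not part of the original module: filtering a file
    # programmatically rather than through the command-line tool.  The file
    # name and search string are assumptions made for the example.
    #
    #     pofile = factory.getobject("messages.po")
    #     checkfilter = GrepFilter(u"save", ["source", "target"], ignorecase=True)
    #     matched = checkfilter.filterfile(pofile)
    #     print str(matched)        # the snippet file holding only matching units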

    def getmatches(self, units):
        if not self.searchstring:
            return [], []

        searchstring = self.searchstring
        flags = re.LOCALE | re.MULTILINE | re.UNICODE

        if self.ignorecase:
            flags |= re.IGNORECASE
        if not self.useregexp:
            searchstring = re.escape(searchstring)
        self.re_search = re.compile(u'(%s)' % (searchstring), flags)

        matches = []
        indexes = []

        for index, unit in enumerate(units):
            old_length = len(matches)

            if self.search_target:
                if unit.hasplural():
                    targets = unit.target.strings
                else:
                    targets = [unit.target]
                matches.extend(find_matches(unit, 'target', targets, self.re_search))
            if self.search_source:
                if unit.hasplural():
                    sources = unit.source.strings
                else:
                    sources = [unit.source]
                matches.extend(find_matches(unit, 'source', sources, self.re_search))
            if self.search_notes:
                matches.extend(find_matches(unit, 'notes', unit.getnotes(), self.re_search))
            if self.search_locations:
                matches.extend(find_matches(unit, 'locations', unit.getlocations(), self.re_search))

            # A search for a single letter or an all-inclusive regular
            # expression could give enough results to cause performance
            # problems. The answer is probably not very useful at this scale.
            if self.max_matches and len(matches) > self.max_matches:
                raise Exception("Too many matches found")

            if len(matches) > old_length:
                old_length = len(matches)
                indexes.append(index)

        return matches, indexes
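
    # Illustrative sketch, not part of the original module: getmatches() returns
    # the individual GrepMatch objects together with the indexes of the units
    # they came from.  The file name and search string are assumptions.
    #
    #     pofile = factory.getobject("messages.po")
    #     checkfilter = GrepFilter(u"file", ["target"])
    #     matches, indexes = checkfilter.getmatches(pofile.units)
    #     for match in matches:
    #         print match           # e.g. <GrepMatch "..." part=target[0] start=... end=...>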


class GrepOptionParser(optrecurse.RecursiveOptionParser):
    """a specialized Option Parser for the grep tool..."""

    def parse_args(self, args=None, values=None):
        """parses the command line options, handling implicit input/output args"""
        (options, args) = optrecurse.optparse.OptionParser.parse_args(self, args, values)
        # some intelligence as to what reasonable people might give on the command line
        if args:
            options.searchstring = args[0]
            args = args[1:]
        else:
            self.error("At least one argument must be given for the search string")
        if args and not options.input:
            if not options.output:
                options.input = args[:-1]
                args = args[-1:]
            else:
                options.input = args
                args = []
        if args and not options.output:
            options.output = args[-1]
            args = args[:-1]
        if args:
            self.error("You have used an invalid combination of --input, --output and freestanding args")
        if isinstance(options.input, list) and len(options.input) == 1:
            options.input = options.input[0]
        return (options, args)

    def set_usage(self, usage=None):
        """sets the usage string - if usage not given, uses getusagestring for each option"""
        if usage is None:
            self.usage = "%prog searchstring " + " ".join([self.getusagestring(option) for option in self.option_list])
        else:
            super(GrepOptionParser, self).set_usage(usage)

    def run(self):
        """parses the arguments, and runs recursiveprocess with the resulting options"""
        (options, args) = self.parse_args()
        options.inputformats = self.inputformats
        options.outputoptions = self.outputoptions
        options.checkfilter = GrepFilter(options.searchstring, options.searchparts,
                                         options.ignorecase, options.useregexp,
                                         options.invertmatch, options.accelchar,
                                         locale.getpreferredencoding(), options.includeheader)
        self.usepsyco(options)
        self.recursiveprocess(options)


def rungrep(inputfile, outputfile, templatefile, checkfilter):
    """reads in inputfile, filters using checkfilter, writes to outputfile"""
    fromfile = factory.getobject(inputfile)
    tofile = checkfilter.filterfile(fromfile)
    if tofile.isempty():
        return False
    outputfile.write(str(tofile))
    return True


def cmdlineparser():
    formats = {"po": ("po", rungrep), "pot": ("pot", rungrep),
               "mo": ("mo", rungrep), "gmo": ("gmo", rungrep),
               "tmx": ("tmx", rungrep),
               "xliff": ("xliff", rungrep), "xlf": ("xlf", rungrep), "xlff": ("xlff", rungrep),
               None: ("po", rungrep)}
    parser = GrepOptionParser(formats)
    parser.add_option("", "--search", dest="searchparts",
                      action="append", type="choice",
                      choices=["source", "target", "notes", "locations", "msgid", "msgstr", "comment"],
                      metavar="SEARCHPARTS",
                      help="searches the given parts (source, target, notes and locations)")
    parser.add_option("-I", "--ignore-case", dest="ignorecase",
                      action="store_true", default=False, help="ignore case distinctions")
    parser.add_option("-e", "--regexp", dest="useregexp",
                      action="store_true", default=False, help="use regular expression matching")
    parser.add_option("-v", "--invert-match", dest="invertmatch",
                      action="store_true", default=False, help="select non-matching lines")
    parser.add_option("", "--accelerator", dest="accelchar",
                      action="store", type="choice", choices=["&", "_", "~"],
                      metavar="ACCELERATOR", help="ignores the given accelerator when matching")
    parser.add_option("", "--header", dest="includeheader",
                      action="store_true", default=False,
                      help="include a PO header in the output")
    parser.set_usage()
    parser.passthrough.append('checkfilter')
    parser.description = __doc__
    return parser


def main():
    parser = cmdlineparser()
    parser.run()


if __name__ == '__main__':
    main()