Package translate :: Package storage :: Module dtd
[hide private]
[frames] | no frames]

Source Code for Module translate.storage.dtd

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2002-2006 Zuza Software Foundation 
  5  # 
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  """classes that hold units of .dtd files (dtdunit) or entire files (dtdfile) 
 23  these are specific .dtd files for localisation used by mozilla 
 24   
 25  Specifications 
 26  ============== 
 27  The following information is provided by Mozilla:: 
 28   
 29  *  U{Specification<http://www.w3.org/TR/REC-xml/#sec-entexpand>} 
 30   
 31  There is a grammar for entity definitions, which isn't really precise, 
 32  as the spec says.  There's no formal specification for DTD files, it's 
 33  just "whatever makes this work" basically. The whole piece is clearly not 
 34  the strongest point of the xml spec 
 35   
 36  XML elements are allowed in entity values. A number of things that are 
 37  allowed will just break the resulting document, Mozilla forbids these 
 38  in their DTD parser. 
 39  """ 
 40   
 41  from translate.storage import base 
 42  from translate.misc import quote 
 43   
 44  import re 
 45  import warnings 
 46  try: 
 47      from lxml import etree 
 48      import StringIO 
 49  except ImportError: 
 50      etree = None 
 51   
 52  labelsuffixes = (".label", ".title") 
 53  """Label suffixes: entries with this suffix are able to be comibed with accesskeys 
 54  found in in entries ending with L{accesskeysuffixes}""" 
 55  accesskeysuffixes = (".accesskey", ".accessKey", ".akey") 
 56  """Accesskey Suffixes: entries with this suffix may be combined with labels 
 57  ending in L{labelsuffixes} into accelerator notation""" 
 58   
 59   
def quotefordtd(source):
    """Quote a string so it can be written as a DTD entity value.

    The quoting style is chosen from the quote characters present in
    L{source}:
      - no double quote: delegated to C{quote.quotestr}
      - double quote but no apostrophe: delegated to
        C{quote.singlequotestr}
      - both: wrapped in apostrophes, with every embedded apostrophe
        replaced by C{&apos;}

    @type source: String
    @param source: Unquoted entity value
    @rtype: String
    @return: The quoted entity value
    """
    if '"' not in source:
        return quote.quotestr(source)
    if "'" not in source:
        return quote.singlequotestr(source)
    # Both kinds of quote are present: delimit with apostrophes and turn the
    # embedded ones into entity references so they cannot end the value early.
    escaped = source.replace("'", '&apos;')
    return "'" + escaped + "'"
68 69
def unquotefromdtd(source):
    """Return the text of a quoted DTD entity definition without its quotes.

    The first character of L{source} is taken to be the quote character and
    the text is extracted with C{quote.extractwithoutquotes}.  An empty
    L{source} is treated as an empty double-quoted string.  For
    apostrophe-quoted definitions, C{&apos;} references are turned back into
    literal apostrophes.

    Quote characters occurring inside the string are not given any special
    treatment here.

    @type source: String
    @param source: Quoted entity definition
    @rtype: String
    @return: The unquoted value
    """
    if not len(source):
        source = '""'
    delimiter = source[0]
    # The second element of the result (whether the closing quote was
    # reached) is not needed here.
    unquoted, _finished = quote.extractwithoutquotes(source, delimiter, delimiter, allowreentry=False)
    if delimiter == "'":
        unquoted = unquoted.replace("&apos;", "'")
    return unquoted
82 83
def removeinvalidamps(name, value):
    """Find and remove ampersands that are not part of an entity reference.

    A stray & in a DTD file can break an application's ability to parse the
    file.  In Mozilla localisation this is very important as these can break
    the parsing of files used in XUL and thus break interface rendering.
    Tracking down the problem is very difficult, thus by removing potentially
    broken & and warning the user we can ensure that the output DTD will
    always be parsable.

    An ampersand is kept only when it is followed, up to the next C{;}, by a
    valid entity name (see the nested helper).  If any ampersand is removed a
    single warning naming the entity is issued through C{warnings.warn}.

    @type name: String
    @param name: Entity name (only used in the warning message)
    @type value: String
    @param value: Entity text value
    @rtype: String
    @return: Entity value without bad ampersands
    """

    def is_valid_entity_name(name):
        """Check that supplied L{name} is a valid entity name.

        Accepted are names made of alphanumerics plus C{.}, C{_} and C{-},
        and character references of the form C{#} followed by alphanumerics
        (e.g. C{#38}, C{#x26}).
        """
        if not name:
            # "&;" has nothing between the ampersand and the semicolon;
            # without this guard name[0] below would raise IndexError.
            return False
        # '.', '_' and '-' are legal XML name characters, so references such
        # as &brand_name; must not lose their ampersand.
        if name.replace('.', '').replace('_', '').replace('-', '').isalnum():
            return True
        return name[0] == '#' and name[1:].isalnum()

    # Collect the position of every ampersand that does not start a valid
    # "&name;" reference.
    invalid_amps = []
    amppos = value.find("&")
    while amppos != -1:
        semipos = value.find(";", amppos + 1)
        if semipos == -1 or not is_valid_entity_name(value[amppos + 1:semipos]):
            invalid_amps.append(amppos)
        amppos = value.find("&", amppos + 1)

    if invalid_amps:
        warnings.warn("invalid ampersands in dtd entity %s" % (name))
        # Rebuild the value from the stretches between the bad ampersands;
        # positions are ascending, so one left-to-right pass is enough.
        pieces = []
        start = 0
        for amppos in invalid_amps:
            pieces.append(value[start:amppos])
            start = amppos + 1
        pieces.append(value[start:])
        value = "".join(pieces)
    return value
class dtdunit(base.TranslationUnit):
    """this class represents an entity definition from a dtd file (and possibly associated comments)

    State kept on the instance:
      - C{comments}: list of C{(commenttype, text)} tuples that precede the
        entity
      - C{unparsedlines}: lines that were neither comment nor entity
      - C{entity}: the entity name, or None when no entity was parsed
      - C{definition}: the entity value, still in its quoted form
      - C{space_pre_entity} / C{space_pre_definition}: whitespace used before
        the name and before the value when the entity is written out
      - C{incomment}, C{inentity} and related attributes: parser state used
        by L{parse}
    """

    def __init__(self, source=""):
        """construct the dtdunit, prepare it for parsing"""
        super(dtdunit, self).__init__(source)
        self.comments = []
        self.unparsedlines = []
        self.incomment = False
        self.inentity = False
        self.entity = "FakeEntityOnlyForInitialisationAndTesting"
        self.source = source
        self.space_pre_entity = ' '
        self.space_pre_definition = ' '

    # Note that source and target are equivalent for monolingual units:
    # both properties read and write self.definition.
    def setsource(self, source):
        """Sets the definition to the quoted value of source

        None is treated as an empty string, matching L{settarget}.
        """
        if source is None:
            source = ""
        self.definition = quotefordtd(source)
        self._rich_source = None

    def getsource(self):
        """gets the unquoted source string"""
        return unquotefromdtd(self.definition)
    source = property(getsource, setsource)

    def settarget(self, target):
        """Sets the definition to the quoted value of target

        None is treated as an empty string.
        """
        if target is None:
            target = ""
        self.definition = quotefordtd(target)
        self._rich_target = None

    def gettarget(self):
        """gets the unquoted target string"""
        return unquotefromdtd(self.definition)
    target = property(gettarget, settarget)

    def isnull(self):
        """returns whether this dtdunit doesn't actually have an entity definition

        The unit counts as null when C{self.entity} is None, which is the
        value L{parse} assigns before it finds an entity.
        """
        # TODO: this needs to work better with base class expectations
        return self.entity is None

    def parse(self, dtdsrc):
        """read the first dtd element from the source code into this object, return linesprocessed

        Lines are consumed one at a time.  Comments are collected into
        C{self.comments}, lines that are neither comment nor entity go to
        C{self.unparsedlines}, and scanning stops as soon as the quoted
        definition of the first entity has been closed.

        @type dtdsrc: String
        @param dtdsrc: DTD source text, lines separated by newlines
        @rtype: int
        @return: Number of lines consumed (0 for empty input)
        @raise ValueError: If the entity value does not start with a single
            or double quote character
        """
        self.comments = []
        # All comment categories share the one list, so the original order of
        # the comments is preserved on output.
        self.locfilenotes = self.comments
        self.locgroupstarts = self.comments
        self.locgroupends = self.comments
        self.locnotes = self.comments
        self.entity = None
        self.definition = ''
        if not dtdsrc:
            return 0
        lines = dtdsrc.split("\n")
        linesprocessed = 0
        comment = ""
        for line in lines:
            line += "\n"
            linesprocessed += 1
            if not self.incomment:
                if (line.find('<!--') != -1):
                    self.incomment = True
                    self.continuecomment = False
                    # now work out the type of comment, and save it (remember we're not in the comment yet)
                    # NOTE(review): quote.extract presumably returns
                    # (extracted text, still-inside flag) - confirm in translate.misc.quote
                    (comment, dummy) = quote.extract(line, "<!--", "-->", None, 0)
                    if comment.find('LOCALIZATION NOTE') != -1:
                        l = quote.findend(comment, 'LOCALIZATION NOTE')
                        # skip the spaces between the marker and its qualifier
                        while (comment[l] == ' '):
                            l += 1
                        if comment.find('FILE', l) == l:
                            self.commenttype = "locfile"
                        elif comment.find('BEGIN', l) == l:
                            self.commenttype = "locgroupstart"
                        elif comment.find('END', l) == l:
                            self.commenttype = "locgroupend"
                        else:
                            self.commenttype = "locnote"
                    else:
                        # plain comment
                        self.commenttype = "comment"
                # FIXME: the entity might share a line with something important
                elif not self.inentity and re.search("%.*;", line):
                    # A "%...;" line outside of an entity is stored verbatim
                    # as a plain comment and otherwise ignored.
                    self.comments.append(("comment", line))
                    line = ""
                    continue

            if self.incomment:
                # some kind of comment
                (comment, self.incomment) = quote.extract(line, "<!--", "-->", None, self.continuecomment)
                self.continuecomment = self.incomment
                # strip the comment out of what will be parsed
                line = line.replace(comment, "", 1)
                # add an end of line if this is the end of the comment
                if not self.incomment:
                    if line.isspace():
                        comment += line
                        line = ''
                    else:
                        comment += '\n'
                # TODO: check if there's actually an entity definition that's
                # commented out; parse these and store as obsolete messages
                # depending on the type of comment (worked out at the start), put it in the right place
                # make it record the comment and type as a tuple
                commentpair = (self.commenttype, comment)
                if self.commenttype == "locfile":
                    self.locfilenotes.append(commentpair)
                elif self.commenttype == "locgroupstart":
                    self.locgroupstarts.append(commentpair)
                elif self.commenttype == "locgroupend":
                    self.locgroupends.append(commentpair)
                elif self.commenttype == "locnote":
                    self.locnotes.append(commentpair)
                elif self.commenttype == "comment":
                    self.comments.append(commentpair)

            if not self.inentity and not self.incomment:
                entitypos = line.find('<!ENTITY')
                if entitypos != -1:
                    self.inentity = True
                    # text in front of <!ENTITY that starts with "#" is kept
                    # so that getoutput can write it back
                    beforeentity = line[:entitypos].strip()
                    if beforeentity.startswith("#"):
                        self.hashprefix = beforeentity
                    self.entitypart = "start"
                else:
                    self.unparsedlines.append(line)

            if self.inentity:
                # The entity is read in stages tracked by self.entitypart:
                # "start" -> "name" -> ("parameter" ->) "definition".
                if self.entitypart == "start":
                    # the entity definition
                    e = quote.findend(line, '<!ENTITY')
                    line = line[e:]
                    self.entitypart = "name"
                    self.entitytype = "internal"
                if self.entitypart == "name":
                    s = 0
                    e = 0
                    while (e < len(line) and line[e].isspace()):
                        e += 1
                    self.space_pre_entity = ' ' * (e - s)
                    s = e
                    self.entity = ''
                    # a leading "%" switches the entity type to "external"
                    if (e < len(line) and line[e] == '%'):
                        self.entitytype = "external"
                        self.entityparameter = ""
                        e += 1
                        while (e < len(line) and line[e].isspace()):
                            e += 1
                    while (e < len(line) and not line[e].isspace()):
                        self.entity += line[e]
                        e += 1
                    s = e
                    while (e < len(line) and line[e].isspace()):
                        e += 1
                    self.space_pre_definition = ' ' * (e - s)
                    if self.entity:
                        if self.entitytype == "external":
                            self.entitypart = "parameter"
                        else:
                            self.entitypart = "definition"
                        # remember the start position and the quote character
                        if e == len(line):
                            # the value starts on a later line
                            self.entityhelp = None
                            e = 0
                            continue
                        elif self.entitypart == "definition":
                            self.entityhelp = (e, line[e])
                            self.instring = False
                if self.entitypart == "parameter":
                    while (e < len(line) and line[e].isspace()):
                        e += 1
                    paramstart = e
                    while (e < len(line) and line[e].isalnum()):
                        e += 1
                    self.entityparameter += line[paramstart:e]
                    while (e < len(line) and line[e].isspace()):
                        e += 1
                    line = line[e:]
                    e = 0
                    if not line:
                        continue
                    if line[0] in ('"', "'"):
                        self.entitypart = "definition"
                        self.entityhelp = (e, line[e])
                        self.instring = False
                if self.entitypart == "definition":
                    if self.entityhelp is None:
                        # look for the opening quote on this line
                        e = 0
                        while (e < len(line) and line[e].isspace()):
                            e += 1
                        if e == len(line):
                            continue
                        self.entityhelp = (e, line[e])
                        self.instring = False
                    # actually the lines below should remember instring, rather than using it as dummy
                    e = self.entityhelp[0]
                    if (self.entityhelp[1] == "'"):
                        (defpart, self.instring) = quote.extract(line[e:], "'", "'", startinstring=self.instring, allowreentry=False)
                    elif (self.entityhelp[1] == '"'):
                        (defpart, self.instring) = quote.extract(line[e:], '"', '"', startinstring=self.instring, allowreentry=False)
                    else:
                        raise ValueError("Unexpected quote character... %r" % (self.entityhelp[1]))
                    # for any following lines, start at the beginning of the line. remember the quote character
                    self.entityhelp = (0, self.entityhelp[1])
                    self.definition += defpart
                    if not self.instring:
                        # the value is complete: this unit is done
                        self.inentity = False
                        break

        # change the condition below to a true value to debug processing
        if 0:
            for attr in dir(self):
                r = repr(getattr(self, attr))
                if len(r) > 60:
                    r = r[:57] + "..."
                self.comments.append(("comment", "self.%s = %s" % (attr, r)))
        return linesprocessed

    def __str__(self):
        """convert to a string. double check that unicode is handled somehow here

        Unicode output is encoded with C{self.encoding} when that attribute
        exists, otherwise as UTF-8.
        """
        source = self.getoutput()
        if isinstance(source, unicode):
            return source.encode(getattr(self, "encoding", "UTF-8"))
        return source

    def getoutput(self):
        """convert the dtd entity back to string form

        Comments and unparsed lines come first.  A null unit returns just
        those, right-stripped and ending in a single newline.  Otherwise the
        C{<!ENTITY ...>} line is appended, provided the entity name is not
        empty.
        """
        lines = [comment for commenttype, comment in self.comments]
        lines.extend(self.unparsedlines)
        if self.isnull():
            return "".join(lines).rstrip() + "\n"
        if len(self.entity) > 0:
            if getattr(self, 'entitytype', None) == 'external':
                entityline = '<!ENTITY % ' + self.entity + ' ' + self.entityparameter + ' ' + self.definition + '>'
            else:
                entityline = '<!ENTITY' + self.space_pre_entity + self.entity + self.space_pre_definition + self.definition + '>'
            # put back any "#..." text that preceded <!ENTITY in the input
            if getattr(self, 'hashprefix', None):
                entityline = self.hashprefix + " " + entityline
            if isinstance(entityline, unicode):
                entityline = entityline.encode('UTF-8')
            lines.append(entityline + '\n')
        return "".join(lines)
390 391
392 -class dtdfile(base.TranslationStore):
393 """this class represents a .dtd file, made up of dtdunits""" 394 UnitClass = dtdunit 395
396 - def __init__(self, inputfile=None):
397 """construct a dtdfile, optionally reading in from inputfile""" 398 base.TranslationStore.__init__(self, unitclass=self.UnitClass) 399 self.filename = getattr(inputfile, 'name', '') 400 if inputfile is not None: 401 dtdsrc = inputfile.read() 402 self.parse(dtdsrc) 403 self.makeindex()
404
405 - def parse(self, dtdsrc):
406 """read the source code of a dtd file in and include them as dtdunits in self.units""" 407 start = 0 408 end = 0 409 lines = dtdsrc.split("\n") 410 while end < len(lines): 411 if (start == end): 412 end += 1 413 foundentity = False 414 while end < len(lines): 415 if end >= len(lines): 416 break 417 if lines[end].find('<!ENTITY') > -1: 418 foundentity = True 419 if foundentity and re.match("[\"']\s*>", lines[end]): 420 end += 1 421 break 422 end += 1 423 # print "processing from %d to %d" % (start,end) 424 425 linesprocessed = 1 # to initialise loop 426 while linesprocessed >= 1: 427 newdtd = dtdunit() 428 try: 429 linesprocessed = newdtd.parse("\n".join(lines[start:end])) 430 if linesprocessed >= 1 and (not newdtd.isnull() or newdtd.unparsedlines): 431 self.units.append(newdtd) 432 except Exception, e: 433 warnings.warn("%s\nError occured between lines %d and %d:\n%s" % (e, start+1, end, "\n".join(lines[start:end]))) 434 start += linesprocessed
435
436 - def __str__(self):
437 """convert to a string. double check that unicode is handled somehow here""" 438 source = self.getoutput() 439 if not self._valid_store(): 440 warnings.warn("DTD file '%s' does not validate" % self.filename) 441 return None 442 if isinstance(source, unicode): 443 return source.encode(getattr(self, "encoding", "UTF-8")) 444 return source
445
446 - def getoutput(self):
447 """convert the units back to source""" 448 sources = [str(dtd) for dtd in self.units] 449 return "".join(sources)
450
451 - def makeindex(self):
452 """makes self.index dictionary keyed on entities""" 453 self.index = {} 454 for dtd in self.units: 455 if not dtd.isnull(): 456 self.index[dtd.entity] = dtd
457
458 - def _valid_store(self):
459 """Validate the store to determine if it is valid 460 461 This uses ElementTree to parse the DTD 462 463 @return: If the store passes validation 464 @rtype: Boolean 465 """ 466 if etree is not None: 467 try: 468 # #expand is a Mozilla hack and are removed as they are not valid in DTDs 469 dtd = etree.DTD(StringIO.StringIO(re.sub("#expand", "", self.getoutput()))) 470 except etree.DTDParseError: 471 return False 472 return True
473