
Source Code for Module translate.search.indexing.CommonIndexer

# -*- coding: utf-8 -*-
#
# Copyright 2008 Zuza Software Foundation
#
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with translate; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#


"""
Base class for interfaces to indexing engines for Pootle.
"""

import os

import translate.lang.data

__revision__ = "$Id: CommonIndexer.py 15615 2010-08-22 21:13:42Z dwaynebailey $"


def is_available():
    """check if this indexing engine interface is usable

    this function must exist in every module that contains indexing engine
    interfaces

    @return: is this interface usable?
    @rtype: bool
    """
    return False


class CommonDatabase(object):
    """base class for indexing support

    any real implementation must override most methods of this class
    """

    field_analyzers = {}
    """mapping of field names and analyzers - see 'set_field_analyzers'"""

    ANALYZER_EXACT = 0
    """exact matching: the query string must equal the whole term string"""

    ANALYZER_PARTIAL = 1 << 1
    """partial matching: a document matches, even if the query string only
    matches the beginning of the term value."""

    ANALYZER_TOKENIZE = 1 << 2
    """tokenize terms and queries automatically"""

    ANALYZER_DEFAULT = ANALYZER_TOKENIZE | ANALYZER_PARTIAL
    """the default analyzer to be used if nothing is configured"""

    QUERY_TYPE = None
    """override this with the query class of the implementation"""

    INDEX_DIRECTORY_NAME = None
    """override this with a string to be used as the name of the indexing
    directory/file in the filesystem
    """

    def __init__(self, basedir, analyzer=None, create_allowed=True):
        """initialize or open an indexing database

        Any derived class must override __init__.

        Any implementation can rely on the "self.location" attribute to be
        set by the __init__ function of the super class.

        @raise ValueError: the given location exists, but the database type
            is incompatible (e.g. created by a different indexing engine)
        @raise OSError: the database failed to initialize

        @param basedir: the parent directory of the database
        @type basedir: str
        @param analyzer: bitwise combination of possible analyzer flags
            to be used as the default analyzer for this database. Leave it
            empty to use the system default analyzer (self.ANALYZER_DEFAULT).
            see self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ...
        @type analyzer: int
        @param create_allowed: create the database, if necessary; default: True
        @type create_allowed: bool
        """
        # just do some checks
        if self.QUERY_TYPE is None:
            raise NotImplementedError("Incomplete indexer implementation: "
                    "'QUERY_TYPE' is undefined")
        if self.INDEX_DIRECTORY_NAME is None:
            raise NotImplementedError("Incomplete indexer implementation: "
                    "'INDEX_DIRECTORY_NAME' is undefined")
        self.location = os.path.join(basedir, self.INDEX_DIRECTORY_NAME)
        if (not create_allowed) and (not os.path.exists(self.location)):
            raise OSError("Indexer: the database does not exist - and I am"
                    " not configured to create it.")
        if analyzer is None:
            self.analyzer = self.ANALYZER_DEFAULT
        else:
            self.analyzer = analyzer
        self.field_analyzers = {}

    def flush(self, optimize=False):
        """flush the content of the database - to force changes to be written
        to disk

        some databases also support index optimization

        @param optimize: should the index be optimized if possible?
        @type optimize: bool
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                "'flush' is missing")

    def make_query(self, args, require_all=True, analyzer=None):
        """create simple queries (strings or field searches) or
        combine multiple queries (AND/OR)

        To specify rules for field searches, you may want to take a look at
        'set_field_analyzers'. The parameter 'analyzer' can override the
        previously defined default setting.

        @param args: queries or search string or description of field query
            examples::
                [xapian.Query("foo"), xapian.Query("bar")]
                xapian.Query("foo")
                "bar"
                {"foo": "bar", "foobar": "foo"}
        @type args: list of queries | single query | str | dict
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @param analyzer: (only applicable for 'dict' or 'str')
            Define query options (partial matching, exact matching,
            tokenizing, ...) as bitwise combinations of
            CommonIndexer.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the configured analyzer for
            the field is used.
        @type analyzer: int
        @return: the combined query
        @rtype: query type of the specific implementation
        """
        # turn a dict into a list if necessary
        if isinstance(args, dict):
            args = args.items()
        # turn 'args' into a list if necessary
        if not isinstance(args, list):
            args = [args]
        # combine all given queries
        result = []
        for query in args:
            # just add precompiled queries
            if isinstance(query, self.QUERY_TYPE):
                result.append(self._create_query_for_query(query))
            # create field/value queries out of a tuple
            elif isinstance(query, tuple):
                field, value = query
                # perform unicode normalization
                field = translate.lang.data.normalize(unicode(field))
                value = translate.lang.data.normalize(unicode(value))
                # check for the chosen match type
                if analyzer is None:
                    analyzer = self.get_field_analyzers(field)
                result.append(self._create_query_for_field(field, value,
                        analyzer=analyzer))
            # parse plaintext queries
            elif isinstance(query, basestring):
                if analyzer is None:
                    analyzer = self.analyzer
                # perform unicode normalization
                query = translate.lang.data.normalize(unicode(query))
                result.append(self._create_query_for_string(query,
                        require_all=require_all, analyzer=analyzer))
            else:
                # other types of queries are not supported
                raise ValueError("Unable to handle query type: %s"
                        % str(type(query)))
        # return the combined query
        return self._create_query_combined(result, require_all)
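
    # Illustrative sketch (not part of the original module): how a caller
    # might build and combine queries, assuming 'db' is an instance of a
    # concrete subclass (e.g. a hypothetical XapianDatabase):
    #
    #     # AND-combine two field queries described by a dict
    #     q_fields = db.make_query({"msgid": "file", "msgstr": "Datei"})
    #     # OR-combine the field query with a plain string query
    #     q_any = db.make_query([q_fields, "folder"], require_all=False)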

    def _create_query_for_query(self, query):
        """generate a query based on an existing query object

        basically this function should just create a copy of the original

        @param query: the original query object
        @type query: xapian.Query
        @return: the resulting query object
        @rtype: xapian.Query | PyLucene.Query
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                "'_create_query_for_query' is missing")

    def _create_query_for_string(self, text, require_all=True,
            analyzer=None):
        """generate a query for a plain term of a string query

        basically this function parses the string and returns the resulting
        query

        @param text: the query string
        @type text: str
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @param analyzer: Define query options (partial matching, exact
            matching, tokenizing, ...) as bitwise combinations of
            CommonIndexer.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the configured analyzer for
            the field is used.
        @type analyzer: int
        @return: resulting query object
        @rtype: xapian.Query | PyLucene.Query
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                "'_create_query_for_string' is missing")

    def _create_query_for_field(self, field, value, analyzer=None):
        """generate a field query

        this function creates a field->value query

        @param field: the fieldname to be used
        @type field: str
        @param value: the wanted value of the field
        @type value: str
        @param analyzer: Define query options (partial matching, exact
            matching, tokenizing, ...) as bitwise combinations of
            CommonIndexer.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the configured analyzer for
            the field is used.
        @type analyzer: int
        @return: resulting query object
        @rtype: xapian.Query | PyLucene.Query
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                "'_create_query_for_field' is missing")

    def _create_query_combined(self, queries, require_all=True):
        """generate a combined query

        @param queries: list of the original queries
        @type queries: list of xapian.Query
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @return: the resulting combined query object
        @rtype: xapian.Query | PyLucene.Query
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                "'_create_query_combined' is missing")

    def index_document(self, data):
        """add the given data to the database

        @param data: the data to be indexed.
            A dictionary will be treated as fieldname:value combinations.
            If the fieldname is None then the value will be interpreted as a
            plain term or as a list of plain terms.
            Lists of terms are indexed separately.
            Lists of strings are treated as plain terms.
        @type data: dict | list of str
        """
        doc = self._create_empty_document()
        if isinstance(data, dict):
            data = data.items()
        # add all data
        for dataset in data:
            if isinstance(dataset, tuple):
                # the dataset tuple consists of '(key, value)'
                key, value = dataset
                if key is None:
                    if isinstance(value, list):
                        terms = value[:]
                    elif isinstance(value, basestring):
                        terms = [value]
                    else:
                        raise ValueError("Invalid data type to be indexed: %s"
                                % str(type(value)))
                    for one_term in terms:
                        self._add_plain_term(doc, self._decode(one_term),
                                (self.ANALYZER_DEFAULT
                                        & self.ANALYZER_TOKENIZE > 0))
                else:
                    analyze_settings = self.get_field_analyzers(key)
                    # handle multiple terms
                    if not isinstance(value, list):
                        value = [value]
                    for one_term in value:
                        self._add_field_term(doc, key, self._decode(one_term),
                                (analyze_settings
                                        & self.ANALYZER_TOKENIZE > 0))
            elif isinstance(dataset, basestring):
                self._add_plain_term(doc, self._decode(dataset),
                        (self.ANALYZER_DEFAULT & self.ANALYZER_TOKENIZE > 0))
            else:
                raise ValueError("Invalid data type to be indexed: %s"
                        % str(type(dataset)))
        self._add_document_to_index(doc)
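
    # Illustrative sketch (not part of the original module): the data shapes
    # accepted by index_document, assuming 'db' is a concrete subclass
    # instance:
    #
    #     db.index_document(["plain term", "another plain term"])
    #     db.index_document({"msgid": "hello",
    #             "msgstr": ["hallo", "servus"],
    #             None: "an unfielded plain term"})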

    def _create_empty_document(self):
        """create an empty document to be filled and added to the index later

        @return: the new document object
        @rtype: xapian.Document | PyLucene.Document
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                "'_create_empty_document' is missing")

    def _add_plain_term(self, document, term, tokenize=True):
        """add a term to a document

        @param document: the document to be changed
        @type document: xapian.Document | PyLucene.Document
        @param term: a single term to be added
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                "'_add_plain_term' is missing")

    def _add_field_term(self, document, field, term, tokenize=True):
        """add a field term to a document

        @param document: the document to be changed
        @type document: xapian.Document | PyLucene.Document
        @param field: name of the field
        @type field: str
        @param term: term to be associated to the field
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                "'_add_field_term' is missing")

    def _add_document_to_index(self, document):
        """add a prepared document to the index database

        @param document: the document to be added
        @type document: xapian.Document | PyLucene.Document
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                "'_add_document_to_index' is missing")

    def begin_transaction(self):
        """begin a transaction

        You can group multiple modifications of a database as a transaction.
        This prevents time-consuming database flushing and helps if you want
        a changeset to be committed either completely or not at all.
        No changes will be written to disk until 'commit_transaction'.
        'cancel_transaction' can be used to revert an ongoing transaction.

        Database types that do not support transactions may silently ignore
        it.
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                "'begin_transaction' is missing")

    def cancel_transaction(self):
        """cancel an ongoing transaction

        See 'begin_transaction' for details.
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                "'cancel_transaction' is missing")

    def commit_transaction(self):
        """submit the currently ongoing transaction and write changes to disk

        See 'begin_transaction' for details.
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                "'commit_transaction' is missing")
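
    # Illustrative sketch (not part of the original module): the intended
    # transaction pattern, assuming 'db' is a concrete subclass instance and
    # 'units' is some iterable of translation units (hypothetical names):
    #
    #     db.begin_transaction()
    #     try:
    #         for unit in units:
    #             db.index_document({"msgid": unit.source})
    #         db.commit_transaction()
    #     except Exception:
    #         db.cancel_transaction()
    #         raise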

    def get_query_result(self, query):
        """return an object containing the results of a query

        @param query: a pre-compiled query
        @type query: a query object of the real implementation
        @return: an object that allows access to the results
        @rtype: subclass of CommonEnquire
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                "'get_query_result' is missing")

    def delete_document_by_id(self, docid):
        """delete a specified document

        @param docid: the document ID to be deleted
        @type docid: int
        @return: whether the document was deleted successfully
        @rtype: bool
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                "'delete_document_by_id' is missing")

    def search(self, query, fieldnames):
        """return a list of the contents of specified fields for all matches
        of a query

        @param query: the query to be issued
        @type query: a query object of the real implementation
        @param fieldnames: the name(s) of a field of the document content
        @type fieldnames: string | list of strings
        @return: a list of dicts containing the specified field(s)
        @rtype: list of dicts
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                "'search' is missing")
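
    # Illustrative sketch (not part of the original module): running a query
    # and reading the stored fields, assuming 'db' is a concrete subclass
    # instance:
    #
    #     query = db.make_query({"msgid": "hello"})
    #     for item in db.search(query, ["msgid", "msgstr"]):
    #         print item["msgid"], item["msgstr"]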

    def delete_doc(self, ident):
        """delete the documents returned by a query

        @param ident: [list of] document IDs | dict describing a query | query
        @type ident: int | list of tuples | dict | list of dicts |
            query (e.g. xapian.Query) | list of queries
        @return: the number of deleted documents
        @rtype: int
        """
        # turn a doc-ID into a list of doc-IDs
        if isinstance(ident, list):
            # it is already a list
            ident_list = ident
        else:
            ident_list = [ident]
        if len(ident_list) == 0:
            # no matching items
            return 0
        if isinstance(ident_list[0], int):
            # create a list of IDs of all successfully removed documents
            success_delete = [match for match in ident_list
                    if self.delete_document_by_id(match)]
            return len(success_delete)
        if isinstance(ident_list[0], dict):
            # something like: { "msgid": "foobar" }
            # assemble all queries
            query = self.make_query([self.make_query(query_dict,
                    require_all=True) for query_dict in ident_list],
                    require_all=True)
        elif isinstance(ident_list[0], object):
            # assume a query object (combined with 'AND'); note that this
            # check is always true, since every value is an instance of
            # 'object'
            query = self.make_query(ident_list, require_all=True)
        else:
            # invalid element type in list (not reachable in practice - see
            # the note above)
            raise TypeError("description of documents to-be-deleted is not "
                    "supported: list of %s" % type(ident_list[0]))
        # we successfully created a query - now iterate through the result
        # and delete all resulting documents step by step
        remove_list = []

        def add_docid_to_list(match):
            """collect every document ID"""
            remove_list.append(match["docid"])

        self._walk_matches(query, add_docid_to_list)
        return self.delete_doc(remove_list)
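
    # Illustrative sketch (not part of the original module): the different
    # ways to address documents for deletion, assuming 'db' is a concrete
    # subclass instance:
    #
    #     db.delete_doc(42)                      # a single document ID
    #     db.delete_doc([42, 43])                # a list of document IDs
    #     db.delete_doc({"msgid": "obsolete"})   # all matches of a query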

    def _walk_matches(self, query, function, arg_for_function=None):
        """use this function if you want to do something with every single
        match of a query

        example::
            self._walk_matches(query, function_for_match, arg_for_func)

        'function_for_match' expects only one argument: the matched object

        @param query: a query object of the real implementation
        @type query: xapian.Query | PyLucene.Query
        @param function: the function to execute with every match
        @type function: function
        @param arg_for_function: an optional argument for the function
        @type arg_for_function: anything
        """
        # execute the query
        enquire = self.get_query_result(query)
        # start with the first element
        start = 0
        # do the loop at least once
        size, avail = (0, 1)
        # how many results per 'get_matches'?
        steps = 2
        while start < avail:
            (size, avail, matches) = enquire.get_matches(start, steps)
            for match in matches:
                if arg_for_function is None:
                    function(match)
                else:
                    function(match, arg_for_function)
            start += size

    def set_field_analyzers(self, field_analyzers):
        """set the analyzers for different fields of the database documents

        All bitwise combinations of CommonIndexer.ANALYZER_??? are possible.

        @param field_analyzers: mapping of field names and analyzers
        @type field_analyzers: dict containing field names and analyzers
        @raise TypeError: invalid values in 'field_analyzers'
        """
        for field, analyzer in field_analyzers.items():
            # check for invalid input types
            if not isinstance(field, (str, unicode)):
                raise TypeError("field name must be a string")
            if not isinstance(analyzer, int):
                raise TypeError("the analyzer must be a whole number (int)")
            # map the analyzer to the field name
            self.field_analyzers[field] = analyzer

    def get_field_analyzers(self, fieldnames=None):
        """return the analyzer that was mapped to a specific field

        see 'set_field_analyzers' for details

        @param fieldnames: the analyzer of this field (or all/multiple
            fields) is requested; leave empty (or "None") to request all
            fields
        @type fieldnames: str | list of str | None
        @return: the analyzer setting of the field - see
            CommonDatabase.ANALYZER_??? or a dict of field names and analyzers
        @rtype: int | dict
        """
        # all field analyzers are requested
        if fieldnames is None:
            # return a copy
            return dict(self.field_analyzers)
        # one field is requested
        if isinstance(fieldnames, (str, unicode)):
            if fieldnames in self.field_analyzers:
                return self.field_analyzers[fieldnames]
            else:
                return self.analyzer
        # a list of fields is requested
        if isinstance(fieldnames, list):
            result = {}
            for field in fieldnames:
                result[field] = self.get_field_analyzers(field)
            return result
        return self.analyzer
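
    # Illustrative sketch (not part of the original module): configuring
    # per-field analyzers, assuming 'db' is a concrete subclass instance:
    #
    #     db.set_field_analyzers({
    #             "msgid": db.ANALYZER_PARTIAL | db.ANALYZER_TOKENIZE,
    #             "docid": db.ANALYZER_EXACT,
    #     })
    #     db.get_field_analyzers("docid")     # -> ANALYZER_EXACT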

    def _decode(self, text):
        """decode the string from utf-8 or charmap and perform unicode
        normalization
        """
        if isinstance(text, str):
            try:
                result = unicode(text.decode("UTF-8"))
            except UnicodeDecodeError:
                result = unicode(text.decode("charmap"))
        elif not isinstance(text, unicode):
            result = unicode(text)
        else:
            result = text
        # perform unicode normalization
        return translate.lang.data.normalize(result)


class CommonEnquire(object):
    """an enquire object contains the information about the result of a
    request
    """

    def __init__(self, enquire):
        """initialization of a wrapper around enquires of different backends

        @param enquire: a previous enquire
        @type enquire: xapian.Enquire | pylucene-enquire
        """
        self.enquire = enquire

    def get_matches(self, start, number):
        """return a specified number of qualified matches of a previous query

        @param start: index of the first match to return (starting from zero)
        @type start: int
        @param number: the number of matching entries to return
        @type number: int
        @return: a set of matching entries and some statistics
        @rtype: tuple of (returned number, available number, matches)
            "matches" is a list of dicts, each containing the keys::
                "rank", "percent", "document", "docid"
        """
        raise NotImplementedError("Incomplete indexing implementation: "
                "'get_matches' for the 'Enquire' class is missing")

    def get_matches_count(self):
        """return the estimated number of matches

        use "CommonDatabase.search" to retrieve the exact number of matches

        @return: the estimated number of matches
        @rtype: int
        """
        (returned, estimate_count, matches) = self.get_matches(0, 1)
        return estimate_count
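

# Illustrative sketch (not part of the original module): paging through all
# matches of a query with a CommonEnquire subclass, mirroring the loop in
# CommonDatabase._walk_matches; 'db' and 'query' are assumed to come from a
# concrete backend:
#
#     enquire = db.get_query_result(query)
#     print "about %d matches" % enquire.get_matches_count()
#     start, avail = 0, 1
#     while start < avail:
#         (size, avail, matches) = enquire.get_matches(start, 10)
#         for match in matches:
#             print match["docid"], match["percent"]
#         start += size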