/*
 *  call-seq:
 *     FuzzyQuery.new(field, term, options = {}) -> fuzzy-query
 *
 *  Create a new FuzzyQuery that will match terms with a similarity of at
 *  least +:min_similarity+ to +term+. Similarity is scored using the
 *  Levenshtein edit distance formula. See
 *  http://en.wikipedia.org/wiki/Levenshtein_distance
 *
 *  If a +:prefix_length+ > 0 is specified, a common prefix of that length is
 *  also required.
 *
 *  You can also set +:max_terms+ to prevent memory overflow problems. By
 *  default it is set to 512.
 *
 *  == Example
 *
 *    FuzzyQuery.new(:content, "levenshtein",
 *                   :min_similarity => 0.8,
 *                   :prefix_length => 5,
 *                   :max_terms => 1024)
 *
 *  field::           field to search
 *  term::            term to search for including its close matches
 *  :min_similarity:: Default: 0.5. minimum levenshtein distance score for a
 *                    match
 *  :prefix_length::  Default: 0. minimum prefix_match before levenshtein
 *                    distance is measured. This parameter is used to improve
 *                    performance. With a +:prefix_length+ of 0, all terms in
 *                    the index must be checked which can be quite a
 *                    performance hit. By setting the prefix length to a
 *                    larger number you minimize the number of terms that need
 *                    to be checked. Even 1 will cut down the work by a
 *                    factor of about 26 depending on your character set and
 *                    the first letter.
 *  :max_terms::      Limits the number of terms that can be added to the
 *                    query when it is expanded as a MultiTermQuery. This is
 *                    not usually a problem with FuzzyQueries unless you set
 *                    +:min_similarity+ to a very low value.
 *
 *  Raises ArgumentError if +:min_similarity+ is not in the range
 *  0.0 <= x < 1.0, or if +:prefix_length+ or +:max_terms+ is negative.
 *  Raises TypeError if +options+ is not a Hash or if a numeric option cannot
 *  be converted to a number.
 */
static VALUE
frt_fq_init(int argc, VALUE *argv, VALUE self)
{
    Query *q;
    VALUE rfield, rterm, roptions;

    /* Start from the class-level defaults; the options hash (if given)
     * overrides them below. NUM2INT is used instead of FIX2INT so that a
     * non-Fixnum value raises instead of being silently misread. */
    float min_sim =
        (float)NUM2DBL(rb_cvar_get(cFuzzyQuery, id_default_min_similarity));
    int pre_len =
        NUM2INT(rb_cvar_get(cFuzzyQuery, id_default_prefix_length));
    int max_terms =
        NUM2INT(rb_cvar_get(cMultiTermQuery, id_default_max_terms));

    /* Two required arguments (field, term) and one optional (options). */
    if (rb_scan_args(argc, argv, "21", &rfield, &rterm, &roptions) >= 3) {
        VALUE v;
        Check_Type(roptions, T_HASH);
        /* These values come straight from the caller, so they must be
         * type-checked during conversion: NUM2INT raises TypeError or
         * RangeError for anything that is not a valid C int. */
        if (Qnil != (v = rb_hash_aref(roptions, sym_prefix_length))) {
            pre_len = NUM2INT(v);
        }
        if (Qnil != (v = rb_hash_aref(roptions, sym_min_similarity))) {
            min_sim = (float)NUM2DBL(v);
        }
        if (Qnil != (v = rb_hash_aref(roptions, sym_max_terms))) {
            max_terms = NUM2INT(v);
        }
    }

    /* Validate the effective settings before building the query. */
    if (min_sim >= 1.0) {
        rb_raise(rb_eArgError,
                 "%f >= 1.0. :min_similarity must be < 1.0", min_sim);
    }
    else if (min_sim < 0.0) {
        /* 0.0 itself is accepted, so the bound reported is inclusive. */
        rb_raise(rb_eArgError,
                 "%f < 0.0. :min_similarity must be >= 0.0", min_sim);
    }
    if (pre_len < 0) {
        rb_raise(rb_eArgError,
                 "%d < 0. :prefix_length must be >= 0", pre_len);
    }
    if (max_terms < 0) {
        rb_raise(rb_eArgError,
                 "%d < 0. :max_terms must be >= 0", max_terms);
    }

    q = fuzq_new_conf(frt_field(rfield), StringValuePtr(rterm),
                      min_sim, pre_len, max_terms);
    /* Attach the native query to the Ruby object and register the pair. */
    Frt_Wrap_Struct(self, NULL, &frt_q_free, q);
    object_add(q, self);
    return self;
}