Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYuri Gorshenin <y@maps.me>2016-06-07 18:02:17 +0300
committerYuri Gorshenin <y@maps.me>2016-06-07 18:16:42 +0300
commit47ee541872a3da68105fc5080f2abdc2abe7f57e (patch)
tree595532a6bd090be31a05258c429fd8739aa9eeb8 /search/search_quality
parenteeb41dadb89642c8615f949e51476e3d55a9d332 (diff)
[search] Fixed letter-or-building-part handling.
Diffstat (limited to 'search/search_quality')
-rwxr-xr-xsearch/search_quality/clusterize-tag-values.lisp99
1 files changed, 46 insertions, 53 deletions
diff --git a/search/search_quality/clusterize-tag-values.lisp b/search/search_quality/clusterize-tag-values.lisp
index 4685079ba7..66ec3af0a4 100755
--- a/search/search_quality/clusterize-tag-values.lisp
+++ b/search/search_quality/clusterize-tag-values.lisp
@@ -105,7 +105,7 @@ exec /usr/bin/env sbcl --noinform --quit --eval "(defparameter *script-name* \"$
'("building" "bldg" "bld" "bl" "unit" "block" "blk"
"корпус" "корп" "литер" "лит" "строение" "блок" "бл"))
-(defparameter *house-number-seps* '(#\Space #\. #\( #\) #\# #\~))
+(defparameter *house-number-seps* '(#\Space #\Tab #\" #\\ #\( #\) #\. #\# #\~))
(defparameter *house-number-groups-seps* '(#\, #\| #\; #\+))
(defun building-synonym-p (s)
@@ -124,21 +124,57 @@ exec /usr/bin/env sbcl --noinform --quit --eval "(defparameter *script-name* \"$
((char= c #\/) :slash)
(T :string)))
+(defun transform-string-token (fn value)
+ "Transforms building token value into one or more tokens in
+ accordance to it's value. For example, 'литA' is transformed to
+ tokens 'лит' (building part) and 'А' (letter).
+ "
+ (macrolet ((emit (value type) `(funcall fn ,value ,type)))
+ (cond ((building-synonym-p value)
+ (emit value :building-part))
+ ((and (= 4 (length value))
+ (starts-with value "лит"))
+ (emit (subseq value 0 3) :building-part)
+ (emit (subseq value 3) :letter))
+ ((and (= 2 (length value))
+ (short-building-synonym-p (subseq value 0 1)))
+ (emit (subseq value 0 1) :building-part)
+ (emit (subseq value 1) :letter))
+ ((= 1 (length value))
+ (emit value (if (short-building-synonym-p value)
+ :letter-or-building-part
+ :letter)))
+ (T (emit value :string)))))
+
(defun tokenize-house-number (house-number)
"house-number => [token]"
(let ((parts (group-by #'(lambda (lhs rhs)
(eq (get-char-type lhs) (get-char-type rhs)))
- (string-downcase house-number))))
- (remove-if #'(lambda (token) (eq :separator (token-type token)))
- (mapcar #'(lambda (part) (make-token :value (concatenate 'string part)
- :type (get-char-type (car part))))
- parts))))
+ (string-downcase house-number)))
+ (tokens nil))
+ (flet ((add-token (value type) (push (make-token :value value :type type) tokens)))
+ (dolist (part parts)
+ (let ((value (concatenate 'string part))
+ (type (get-char-type (car part))))
+ (case type
+ (:string (transform-string-token #'add-token value))
+ (:separator T)
+ (otherwise (add-token value type)))))
+ (loop for prev = nil then curr
+ for curr in tokens
+ do (when (eq :letter-or-building-part (token-type curr))
+ (cond ((null prev)
+ (setf (token-type curr) :letter))
+ ((eq :number (token-type prev))
+ (setf (token-type curr) :building-part)))))
+ (reverse tokens))))
(defun house-number-with-optional-suffix-p (tokens)
(case (length tokens)
(1 (eq (token-type (first tokens)) :number))
(2 (and (eq (token-type (first tokens)) :number)
- (eq (token-type (second tokens)) :string)))
+ (or (eq (token-type (second tokens)) :string)
+ (eq (token-type (second tokens)) :letter))))
(otherwise nil)))
(defun get-house-number-sub-numbers (house-number)
@@ -157,48 +193,6 @@ exec /usr/bin/env sbcl --noinform --quit --eval "(defparameter *script-name* \"$
groups
(list tokens))))
-(defun parse-house-number (tokens)
- "[token] => [token]
-
- Parses house number, but as the grammar is undefined and ambiguous,
- the parsing is just a split of some tokens, i.e. 'литА' will be split
- to building synonym (литер) and to letter (A).
- "
- (loop with result = nil
- for token in tokens
- for token-value = (token-value token)
- for token-type = (token-type token)
- do (case token-type
- (:string (cond ((building-synonym-p token-value)
- (push (make-token :value token-value
- :type :building-part)
- result))
- ((and (= 4 (length token-value))
- (starts-with token-value "лит"))
- (push (make-token :value (subseq token-value 0 3)
- :type :building-part)
- result)
- (push (make-token :value (subseq token-value 3)
- :type :letter)
- result))
- ((and (= 2 (length token-value))
- (short-building-synonym-p (subseq token-value 0 1)))
- (push (make-token :value (subseq token-value 0 1)
- :type :building-part)
- result)
- (push (make-token :value (subseq token-value 1)
- :type :letter)
- result))
- ((= 1 (length token-value))
- (push (make-token :value token-value
- :type (if (short-building-synonym-p token-value)
- :letter-or-building-part
- :letter))
- result))
- (T (push token result))))
- (otherwise (push token result)))
- finally (return (reverse result))))
-
(defun join-house-number-tokens (tokens)
"Joins token values with spaces."
(format nil "~{~a~^ ~}" (mapcar #'token-value tokens)))
@@ -228,21 +222,20 @@ exec /usr/bin/env sbcl --noinform --quit --eval "(defparameter *script-name* \"$
(defun get-house-number-pattern (house-number fn)
(dolist (number (get-house-number-sub-numbers house-number))
(let ((house-number (join-house-number-tokens number))
- (pattern (join-house-number-parse (parse-house-number number))))
+ (pattern (join-house-number-parse number)))
(funcall fn house-number pattern))))
(defun get-house-number-prefix-strings (house-number fn)
(dolist (number (get-house-number-sub-numbers house-number))
(dolist (string (mapcar #'token-value
- (take-while #'character-token-p (parse-house-number number))))
+ (take-while #'character-token-p number)))
(funcall fn string string))))
(defun get-house-number-inner-strings (house-number fn)
(dolist (number (get-house-number-sub-numbers house-number))
(dolist (string (mapcar #'token-value
(remove-if-not #'character-token-p
- (drop-while #'character-token-p
- (parse-house-number number)))))
+ (drop-while #'character-token-p number))))
(funcall fn string string))))
(defstruct type-settings