diff --git a/wordninja.py b/wordninja.py index 4d7ce15..4000109 100644 --- a/wordninja.py +++ b/wordninja.py @@ -40,8 +40,10 @@ def split(self, s): texts = _SPLIT_RE.split(s) assert len(punctuations) + 1 == len(texts) new_texts = [self._split(x) for x in texts] - for i, punctuation in enumerate(punctuations): - new_texts.insert(2*i+1, punctuation) + # this just seems to add spaces (after PR #13) and the new delimiters in the regex to the result array, if they are already in the input string + # prior to PR #13, it seems like it would add back anything in [^a-zA-Z0-9']+ + # for i, punctuation in enumerate(punctuations): + # new_texts.insert(2*i+1, punctuation) return [item for sublist in new_texts for item in sublist] @@ -83,9 +85,7 @@ def best_match(i): return reversed(out) DEFAULT_LANGUAGE_MODEL = LanguageModel(os.path.join(os.path.dirname(os.path.abspath(__file__)),'wordninja','wordninja_words.txt.gz')) -_SPLIT_RE = re.compile(r"\s+") +_SPLIT_RE = re.compile(r"\/+|\\+|_+|-+|\s+") def split(s): - return DEFAULT_LANGUAGE_MODEL.split(s) - - + return DEFAULT_LANGUAGE_MODEL.split(s) \ No newline at end of file