From ceb3bebf444763d9323ebbac0de8bf9fba604d39 Mon Sep 17 00:00:00 2001 From: Santosh Date: Fri, 17 Feb 2023 23:56:07 -0800 Subject: [PATCH 1/5] fix unit tests --- wordninja.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/wordninja.py b/wordninja.py index 4d7ce15..55f6285 100644 --- a/wordninja.py +++ b/wordninja.py @@ -40,8 +40,9 @@ def split(self, s): texts = _SPLIT_RE.split(s) assert len(punctuations) + 1 == len(texts) new_texts = [self._split(x) for x in texts] - for i, punctuation in enumerate(punctuations): - new_texts.insert(2*i+1, punctuation) + # this just seems to add spaces back to string, if they are already detected? + # for i, punctuation in enumerate(punctuations): + # new_texts.insert(2*i+1, punctuation) return [item for sublist in new_texts for item in sublist] @@ -83,9 +84,7 @@ def best_match(i): return reversed(out) DEFAULT_LANGUAGE_MODEL = LanguageModel(os.path.join(os.path.dirname(os.path.abspath(__file__)),'wordninja','wordninja_words.txt.gz')) -_SPLIT_RE = re.compile(r"\s+") +_SPLIT_RE = re.compile(r"\/+|\\+|_+|-+|\s+") def split(s): - return DEFAULT_LANGUAGE_MODEL.split(s) - - + return DEFAULT_LANGUAGE_MODEL.split(s) \ No newline at end of file From 64e9c4a12667ba6a0b22ed3503d8d84d0574ebc0 Mon Sep 17 00:00:00 2001 From: Santosh Date: Sat, 18 Feb 2023 00:02:08 -0800 Subject: [PATCH 2/5] make comment more helpful --- wordninja.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wordninja.py b/wordninja.py index 55f6285..926370a 100644 --- a/wordninja.py +++ b/wordninja.py @@ -40,7 +40,7 @@ def split(self, s): texts = _SPLIT_RE.split(s) assert len(punctuations) + 1 == len(texts) new_texts = [self._split(x) for x in texts] - # this just seems to add spaces back to string, if they are already detected? + # this just seems to add spaces to the result array, if they are already in the input string? # for i, punctuation in enumerate(punctuations): # new_texts.insert(2*i+1, punctuation) return [item for sublist in new_texts for item in sublist] From 2f840a316b12b7fc945ad0105bd536feb3055528 Mon Sep 17 00:00:00 2001 From: Santosh Date: Sat, 18 Feb 2023 00:22:17 -0800 Subject: [PATCH 3/5] make comment more helpful 2 --- wordninja.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/wordninja.py b/wordninja.py index 926370a..db26948 100644 --- a/wordninja.py +++ b/wordninja.py @@ -40,7 +40,8 @@ def split(self, s): texts = _SPLIT_RE.split(s) assert len(punctuations) + 1 == len(texts) new_texts = [self._split(x) for x in texts] - # this just seems to add spaces to the result array, if they are already in the input string? + # this just seems to add punctuation/spaces (after PR #13) to the result array, if they are already in the input string + # prior to PR # 13, it seems like it would add back anything in [^a-zA-Z0-9']+ # for i, punctuation in enumerate(punctuations): # new_texts.insert(2*i+1, punctuation) return [item for sublist in new_texts for item in sublist] From af701ca5f4dba778c0d81392db8318a95a9a3009 Mon Sep 17 00:00:00 2001 From: Santosh Date: Sat, 18 Feb 2023 00:23:39 -0800 Subject: [PATCH 4/5] make comment more helpful 3 --- wordninja.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wordninja.py b/wordninja.py index db26948..e0aba40 100644 --- a/wordninja.py +++ b/wordninja.py @@ -40,7 +40,7 @@ def split(self, s): texts = _SPLIT_RE.split(s) assert len(punctuations) + 1 == len(texts) new_texts = [self._split(x) for x in texts] - # this just seems to add punctuation/spaces (after PR #13) to the result array, if they are already in the input string + # this just seems to add spaces (after PR #13) to the result array, if they are already in the input string # prior to PR # 13, it seems like it would add back anything in [^a-zA-Z0-9']+ # for i, punctuation in enumerate(punctuations): # new_texts.insert(2*i+1, punctuation) From 850f482f54eae19cd4ba0d2cf5426d378588e8ec Mon Sep 17 00:00:00 2001 From: Santosh Date: Sat, 18 Feb 2023 16:44:32 -0800 Subject: [PATCH 5/5] update comments 4 (will squash these) --- wordninja.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wordninja.py b/wordninja.py index e0aba40..4000109 100644 --- a/wordninja.py +++ b/wordninja.py @@ -40,7 +40,7 @@ def split(self, s): texts = _SPLIT_RE.split(s) assert len(punctuations) + 1 == len(texts) new_texts = [self._split(x) for x in texts] - # this just seems to add spaces (after PR #13) to the result array, if they are already in the input string + # this just seems to add spaces (after PR #13) + the new delimiters in the regex to the result array, if they are already in the input string # prior to PR # 13, it seems like it would add back anything in [^a-zA-Z0-9']+ # for i, punctuation in enumerate(punctuations): # new_texts.insert(2*i+1, punctuation)