From d74c15572d0fa570dee87470660a353bc3a2c947 Mon Sep 17 00:00:00 2001 From: alekssamos Date: Sun, 9 Aug 2020 13:58:49 +0300 Subject: [PATCH 1/8] Close btimby/fulltext#97 Add fb2 extension backend --- README.rst | 4 +++ fulltext/__init__.py | 6 ++++ fulltext/backends/__fb2.py | 61 ++++++++++++++++++++++++++++++++++++++ setup.py | 2 +- 4 files changed, 72 insertions(+), 1 deletion(-) create mode 100644 fulltext/backends/__fb2.py diff --git a/README.rst b/README.rst index 0858c24..6da5bcf 100644 --- a/README.rst +++ b/README.rst @@ -54,6 +54,8 @@ Supported formats +-----------+-------------------------------------+----------------------------------------------+ | ``epub`` | ``ebooklib`` module | ``ebooklib`` module | +-----------+-------------------------------------+----------------------------------------------+ +| ``fb2`` | ``BeautifulSoup`` module | ``BeautifulSoup`` module | ++-----------+-------------------------------------+----------------------------------------------+ | ``gif`` | tesseract CLI and pytesserac module | | +-----------+-------------------------------------+----------------------------------------------+ | ``gz`` | python ``gzip`` module | python ``gzip`` module | @@ -116,6 +118,8 @@ file extensions: +-----------+-------------------------------------+----------------------------------------------+ | ``epub`` | ``exiftool`` CLI tool | | +-----------+-------------------------------------+----------------------------------------------+ +| ``fb2`` | ``BeautifulSoup`` module | ``BeautifulSoup`` module | ++-----------+-------------------------------------+----------------------------------------------+ | ``html`` | ``BeautifulSoup`` module | ``BeautifulSoup`` module | +-----------+-------------------------------------+----------------------------------------------+ | ``odt`` | ``exiftool`` CLI tool | ``exiftool`` CLI tool | diff --git a/fulltext/__init__.py b/fulltext/__init__.py index 6b7b57d..2b64c29 100644 --- a/fulltext/__init__.py +++ b/fulltext/__init__.py @@ -113,6 +113,7 @@ from fulltext.backends import __docx # NOQA from fulltext.backends import __eml # NOQA from fulltext.backends import __epub # NOQA + from fulltext.backends import __fb2 # NOQA from fulltext.backends import __gz # NOQA from fulltext.backends import __html # NOQA from fulltext.backends import __hwp # NOQA @@ -276,6 +277,11 @@ def register_backend(mimetype, module, extensions=None): 'fulltext.backends.__epub', extensions=[".epub"]) +register_backend( + "application/fb2", + 'fulltext.backends.__fb2', + extensions=[".fb2"]) + register_backend( 'application/postscript', 'fulltext.backends.__ps', diff --git a/fulltext/backends/__fb2.py b/fulltext/backends/__fb2.py new file mode 100644 index 0000000..7bcbc26 --- /dev/null +++ b/fulltext/backends/__fb2.py @@ -0,0 +1,61 @@ +from __future__ import absolute_import + +import re + +import bs4 + +from six import StringIO +from six import PY3 + +from fulltext.util import BaseBackend + + +class Backend(BaseBackend): + + def setup(self): + self.bs = None + + def is_visible(self, elem): + if isinstance(elem, (bs4.element.ProcessingInstruction, + bs4.element.Doctype)): + return False + + if elem.parent.name not in ["body", "p"]: + return False + + if not PY3: + elem = elem.encode(self.encoding, self.encoding_errors) + + if re.match('', elem): + return False + + return True + + def handle_fobj(self, f): + bdata = f.read() + tdata = self.decode(bdata) + text, bs = StringIO(), bs4.BeautifulSoup(tdata, 'lxml') + + for elem in bs.findAll(text=True): + if elem.parent.name == "empty-line": + text.write(u"\n") + if self.is_visible(elem): + text.write(elem) + text.write(u"\n") + + return text.getvalue() + + def handle_title(self, f): + fname = "" + s = "" + try: + fname = f.name + except AttributeError: + fname = f + + with open(fname, "r", encoding = self.encoding, errors = self.encoding_errors) as book: s = book.read() + bs = bs4.BeautifulSoup(s, 'lxml') + t = getattr(bs, "book-title", None) + if t is None: + return None + return getattr(t, "string", None) diff --git a/setup.py b/setup.py index a815a73..e77835b 100755 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ NAME = 'fulltext' -VERSION = '0.7' +VERSION = '0.8' if os.name == 'nt' and not sys.maxsize > 2 ** 32: # https://github.com/btimby/fulltext/issues/79 raise RuntimeError("Python 32 bit is not supported") From 52a016edcfcf43dd1221d55ed32748855ef9988e Mon Sep 17 00:00:00 2001 From: VonC Date: Thu, 13 Aug 2020 18:28:42 +0200 Subject: [PATCH 2/8] Add space to restore table The table is now visible, again. --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 6da5bcf..eadd27c 100644 --- a/README.rst +++ b/README.rst @@ -54,7 +54,7 @@ Supported formats +-----------+-------------------------------------+----------------------------------------------+ | ``epub`` | ``ebooklib`` module | ``ebooklib`` module | +-----------+-------------------------------------+----------------------------------------------+ -| ``fb2`` | ``BeautifulSoup`` module | ``BeautifulSoup`` module | +| ``fb2`` | ``BeautifulSoup`` module | ``BeautifulSoup`` module | +-----------+-------------------------------------+----------------------------------------------+ | ``gif`` | tesseract CLI and pytesserac module | | +-----------+-------------------------------------+----------------------------------------------+ From 4df4e2ee783f9cc5b7cee3930c4c90ae6933104a Mon Sep 17 00:00:00 2001 From: alekssamos Date: Fri, 14 Aug 2020 21:46:54 +0300 Subject: [PATCH 3/8] Add space to restore table 2 The table 2 is now visible, again. --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index eadd27c..7e30906 100644 --- a/README.rst +++ b/README.rst @@ -118,7 +118,7 @@ file extensions: +-----------+-------------------------------------+----------------------------------------------+ | ``epub`` | ``exiftool`` CLI tool | | +-----------+-------------------------------------+----------------------------------------------+ -| ``fb2`` | ``BeautifulSoup`` module | ``BeautifulSoup`` module | +| ``fb2`` | ``BeautifulSoup`` module | ``BeautifulSoup`` module | +-----------+-------------------------------------+----------------------------------------------+ | ``html`` | ``BeautifulSoup`` module | ``BeautifulSoup`` module | +-----------+-------------------------------------+----------------------------------------------+ From 4a439915b9fa9740c8c57e0a3a44d2fe7707a652 Mon Sep 17 00:00:00 2001 From: Alexey Date: Thu, 21 Oct 2021 22:43:05 +0300 Subject: [PATCH 4/8] edit info end bump version --- setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index e77835b..f1a85a5 100755 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ NAME = 'fulltext' -VERSION = '0.8' +VERSION = '0.9' if os.name == 'nt' and not sys.maxsize > 2 ** 32: # https://github.com/btimby/fulltext/issues/79 raise RuntimeError("Python 32 bit is not supported") @@ -36,8 +36,8 @@ version=VERSION, description='Convert binary files to plain text for indexing.', long_description=DESCRIPTION, - author='Ben Timby', - author_email='btimby@gmail.com', + author='Ben Timby, alekssamos', + author_email='btimby@gmail.com, aleks-samos@yandex.ru', maintainer='Ben Timby', maintainer_email='btimby@gmail.com', url='http://github.com/btimby/' + NAME + '/', From e967048da5d31b0eb8afb223ea8b4c9e3df8ea2a Mon Sep 17 00:00:00 2001 From: Alexey Date: Thu, 21 Oct 2021 22:43:59 +0300 Subject: [PATCH 5/8] add chardet --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 202dbe3..d246e0e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,3 +19,4 @@ flake8 git+https://github.com/mattgwwalker/msg-extractor.git@2a24c9950d34932ed5979693cb70d758d78715df#egg=ExtractMsg git+https://github.com/smarnach/pyexiftool.git#egg=pyexiftool +chardet From 28f1731fd9bc3f1e794d2b7941c4b3827bbe8fa4 Mon Sep 17 00:00:00 2001 From: Alexey Date: Thu, 21 Oct 2021 22:44:21 +0300 Subject: [PATCH 6/8] fix encoding errors --- fulltext/util.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fulltext/util.py b/fulltext/util.py index f9bbf1d..d64f5a1 100644 --- a/fulltext/util.py +++ b/fulltext/util.py @@ -373,7 +373,11 @@ def check(self, title): def decode(self, s): """Decode string.""" - return s.decode(self.encoding, self.encoding_errors) + import chardet + r = "" + try: r = s.decode(self.encoding, self.encoding_errors) + except UnicodeDecodeError: r = s.decode(chardet.detect(s)["encoding"], self.encoding_errors) + return r def handle_title(self, path_or_file): """May be overridden by sublass in order to retrieve file title.""" From 57807fc7144959e378bbcd1554059fca258b1d0a Mon Sep 17 00:00:00 2001 From: Alexey Date: Wed, 27 Oct 2021 12:04:57 +0300 Subject: [PATCH 7/8] I need new lines and spaces. --- fulltext/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fulltext/__init__.py b/fulltext/__init__.py index 2b64c29..9303eed 100644 --- a/fulltext/__init__.py +++ b/fulltext/__init__.py @@ -583,7 +583,7 @@ def _get(path_or_file, default, mime, name, backend, encoding, inst.teardown() assert text is not None, "backend function returned None" - text = STRIP_WHITE.sub(' ', text) + # text = STRIP_WHITE.sub(' ', text) text = text.strip() return (text, title) diff --git a/setup.py b/setup.py index f1a85a5..c17280f 100755 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ NAME = 'fulltext' -VERSION = '0.9' +VERSION = '0.10' if os.name == 'nt' and not sys.maxsize > 2 ** 32: # https://github.com/btimby/fulltext/issues/79 raise RuntimeError("Python 32 bit is not supported") From 73c70f372ae511e65df05ce83f9ffedf30cd5f23 Mon Sep 17 00:00:00 2001 From: Alexey Date: Sat, 29 Jan 2022 18:50:39 +0300 Subject: [PATCH 8/8] replace chardet to charset_normalizer --- fulltext/util.py | 4 ++-- requirements.txt | 2 +- setup.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fulltext/util.py b/fulltext/util.py index d64f5a1..a63098a 100644 --- a/fulltext/util.py +++ b/fulltext/util.py @@ -373,10 +373,10 @@ def check(self, title): def decode(self, s): """Decode string.""" - import chardet + from charset_normalizer import detect r = "" try: r = s.decode(self.encoding, self.encoding_errors) - except UnicodeDecodeError: r = s.decode(chardet.detect(s)["encoding"], self.encoding_errors) + except UnicodeDecodeError: r = s.decode(detect(s)["encoding"], self.encoding_errors) return r def handle_title(self, path_or_file): diff --git a/requirements.txt b/requirements.txt index d246e0e..29bd05a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,4 +19,4 @@ flake8 git+https://github.com/mattgwwalker/msg-extractor.git@2a24c9950d34932ed5979693cb70d758d78715df#egg=ExtractMsg git+https://github.com/smarnach/pyexiftool.git#egg=pyexiftool -chardet +charset-normalizer diff --git a/setup.py b/setup.py index c17280f..88553b6 100755 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ NAME = 'fulltext' -VERSION = '0.10' +VERSION = '0.11' if os.name == 'nt' and not sys.maxsize > 2 ** 32: # https://github.com/btimby/fulltext/issues/79 raise RuntimeError("Python 32 bit is not supported")