diff --git a/README.rst b/README.rst
index 0858c24..7e30906 100644
--- a/README.rst
+++ b/README.rst
@@ -54,6 +54,8 @@ Supported formats
 +-----------+-------------------------------------+----------------------------------------------+
 | ``epub``  | ``ebooklib`` module                 | ``ebooklib`` module                          |
 +-----------+-------------------------------------+----------------------------------------------+
+| ``fb2``   | ``BeautifulSoup`` module            | ``BeautifulSoup`` module                     |
++-----------+-------------------------------------+----------------------------------------------+
 | ``gif``   | tesseract CLI and pytesserac module |                                              |
 +-----------+-------------------------------------+----------------------------------------------+
 | ``gz``    | python ``gzip`` module              | python ``gzip`` module                       |
@@ -116,6 +118,8 @@ file extensions:
 +-----------+-------------------------------------+----------------------------------------------+
 | ``epub``  | ``exiftool`` CLI tool               |                                              |
 +-----------+-------------------------------------+----------------------------------------------+
+| ``fb2``   | ``BeautifulSoup`` module            | ``BeautifulSoup`` module                     |
++-----------+-------------------------------------+----------------------------------------------+
 | ``html``  | ``BeautifulSoup`` module            | ``BeautifulSoup`` module                     |
 +-----------+-------------------------------------+----------------------------------------------+
 | ``odt``   | ``exiftool`` CLI tool               | ``exiftool`` CLI tool                        |
diff --git a/fulltext/__init__.py b/fulltext/__init__.py
index 6b7b57d..9303eed 100644
--- a/fulltext/__init__.py
+++ b/fulltext/__init__.py
@@ -113,6 +113,7 @@
     from fulltext.backends import __docx  # NOQA
     from fulltext.backends import __eml  # NOQA
     from fulltext.backends import __epub  # NOQA
+    from fulltext.backends import __fb2  # NOQA
     from fulltext.backends import __gz  # NOQA
     from fulltext.backends import __html  # NOQA
     from fulltext.backends import __hwp  # NOQA
@@ -276,6 +277,11 @@ def register_backend(mimetype, module, extensions=None):
     'fulltext.backends.__epub',
     extensions=[".epub"])
 
+register_backend(
+    "application/fb2",
+    'fulltext.backends.__fb2',
+    extensions=[".fb2"])
+
 register_backend(
     'application/postscript',
     'fulltext.backends.__ps',
@@ -577,7 +583,7 @@ def _get(path_or_file, default, mime, name, backend, encoding,
         inst.teardown()
 
     assert text is not None, "backend function returned None"
-    text = STRIP_WHITE.sub(' ', text)
+    # text = STRIP_WHITE.sub(' ', text)
     text = text.strip()
     return (text, title)
 
diff --git a/fulltext/backends/__fb2.py b/fulltext/backends/__fb2.py
new file mode 100644
index 0000000..7bcbc26
--- /dev/null
+++ b/fulltext/backends/__fb2.py
@@ -0,0 +1,60 @@
+from __future__ import absolute_import
+
+import io
+
+import bs4
+
+from six import StringIO
+
+from fulltext.util import BaseBackend
+
+
+class Backend(BaseBackend):
+    """Extract text and title from FictionBook (``.fb2``) files."""
+
+    def setup(self):
+        self.bs = None
+
+    def is_visible(self, elem):
+        """Return True if text node ``elem`` belongs to the output."""
+        # Comments, processing instructions and the doctype are all
+        # NavigableString subclasses, so findAll(text=True) yields them.
+        if isinstance(elem, (bs4.element.Comment,
+                             bs4.element.ProcessingInstruction,
+                             bs4.element.Doctype)):
+            return False
+
+        # Only text sitting directly inside <body> or <p> is kept.
+        return elem.parent.name in ("body", "p")
+
+    def handle_fobj(self, f):
+        """Return the visible text of the book read from ``f``."""
+        bdata = f.read()
+        tdata = self.decode(bdata)
+        text, bs = StringIO(), bs4.BeautifulSoup(tdata, 'lxml')
+
+        for elem in bs.findAll(text=True):
+            if elem.parent.name == "empty-line":
+                text.write(u"\n")
+            if self.is_visible(elem):
+                text.write(elem)
+                text.write(u"\n")
+
+        return text.getvalue()
+
+    def handle_title(self, f):
+        """Return the ``<book-title>`` text, or None if there is none."""
+        # ``f`` may be a path or an open file object; use its name if any.
+        fname = getattr(f, "name", f)
+
+        # Read bytes and go through decode() so the title gets the same
+        # encoding handling as the body. io.open() also exists on
+        # Python 2, where the builtin open() has no ``encoding`` argument.
+        with io.open(fname, "rb") as book:
+            tdata = self.decode(book.read())
+
+        bs = bs4.BeautifulSoup(tdata, 'lxml')
+        t = bs.find("book-title")
+        if t is None:
+            return None
+        return t.string
diff --git a/fulltext/util.py b/fulltext/util.py
index f9bbf1d..a63098a 100644
--- a/fulltext/util.py
+++ b/fulltext/util.py
@@ -373,7 +373,17 @@ def check(self, title):
 
     def decode(self, s):
         """Decode string."""
-        return s.decode(self.encoding, self.encoding_errors)
+        try:
+            return s.decode(self.encoding, self.encoding_errors)
+        except UnicodeDecodeError:
+            # Configured encoding failed: fall back to detection. The
+            # import is local so the dependency is only loaded if needed.
+            from charset_normalizer import detect
+            guessed = detect(s)["encoding"]
+            if guessed is None:
+                # Nothing detected; surface the original decode error.
+                raise
+            return s.decode(guessed, self.encoding_errors)
 
     def handle_title(self, path_or_file):
         """May be overridden by sublass in order to retrieve file title."""
diff --git a/requirements.txt b/requirements.txt
index 202dbe3..29bd05a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,3 +19,4 @@
 flake8
 git+https://github.com/mattgwwalker/msg-extractor.git@2a24c9950d34932ed5979693cb70d758d78715df#egg=ExtractMsg
 git+https://github.com/smarnach/pyexiftool.git#egg=pyexiftool
+charset-normalizer
diff --git a/setup.py b/setup.py
index a815a73..88553b6 100755
--- a/setup.py
+++ b/setup.py
@@ -9,7 +9,7 @@
 
 
 NAME = 'fulltext'
-VERSION = '0.7'
+VERSION = '0.11'
 if os.name == 'nt' and not sys.maxsize > 2 ** 32:
     # https://github.com/btimby/fulltext/issues/79
     raise RuntimeError("Python 32 bit is not supported")
@@ -36,8 +36,8 @@
     version=VERSION,
     description='Convert binary files to plain text for indexing.',
     long_description=DESCRIPTION,
-    author='Ben Timby',
-    author_email='btimby@gmail.com',
+    author='Ben Timby, alekssamos',
+    author_email='btimby@gmail.com, aleks-samos@yandex.ru',
     maintainer='Ben Timby',
     maintainer_email='btimby@gmail.com',
     url='http://github.com/btimby/' + NAME + '/',