btimby · alekssamos · Aug 9, 2020 · Aug 13, 2020 · Aug 14, 2020 · Aug 14, 2020
diff --git a/README.rst b/README.rst
@@ -54,6 +54,8 @@ Supported formats
 +-----------+-------------------------------------+----------------------------------------------+
 | ``epub``  | ``ebooklib`` module                 | ``ebooklib`` module                          |
 +-----------+-------------------------------------+----------------------------------------------+
+| ``fb2``   | ``BeautifulSoup`` module            | ``BeautifulSoup`` module                     |
++-----------+-------------------------------------+----------------------------------------------+
 | ``gif``   | tesseract CLI and pytesserac module |                                              |
 +-----------+-------------------------------------+----------------------------------------------+
 | ``gz``    | python ``gzip`` module              | python ``gzip`` module                       |
@@ -116,6 +118,8 @@ file extensions:
 +-----------+-------------------------------------+----------------------------------------------+
 | ``epub``  | ``exiftool`` CLI tool               |                                              |
 +-----------+-------------------------------------+----------------------------------------------+
+| ``fb2``   | ``BeautifulSoup`` module            | ``BeautifulSoup`` module                     |
++-----------+-------------------------------------+----------------------------------------------+
 | ``html``  | ``BeautifulSoup`` module            | ``BeautifulSoup`` module                     |
 +-----------+-------------------------------------+----------------------------------------------+
 | ``odt``   | ``exiftool`` CLI tool               | ``exiftool`` CLI tool                        |

diff --git a/fulltext/__init__.py b/fulltext/__init__.py
@@ -113,6 +113,7 @@
     from fulltext.backends import __docx  # NOQA
     from fulltext.backends import __eml  # NOQA
     from fulltext.backends import __epub  # NOQA
+    from fulltext.backends import __fb2  # NOQA
     from fulltext.backends import __gz  # NOQA
     from fulltext.backends import __html  # NOQA
     from fulltext.backends import __hwp  # NOQA
@@ -276,6 +277,11 @@ def register_backend(mimetype, module, extensions=None):
         'fulltext.backends.__epub',
         extensions=[".epub"])
 
+register_backend(
+    "application/fb2",
+    'fulltext.backends.__fb2',
+    extensions=[".fb2"])
+
 register_backend(
     'application/postscript',
     'fulltext.backends.__ps',
@@ -577,7 +583,7 @@ def _get(path_or_file, default, mime, name, backend, encoding,
         inst.teardown()
 
     assert text is not None, "backend function returned None"
-    text = STRIP_WHITE.sub(' ', text)
+    # text = STRIP_WHITE.sub(' ', text)
     text = text.strip()
     return (text, title)
 

diff --git a/fulltext/backends/__fb2.py b/fulltext/backends/__fb2.py
@@ -0,0 +1,85 @@
+from __future__ import absolute_import
+
+import io
+import re
+
+import bs4
+
+from six import StringIO
+from six import PY3
+
+from fulltext.util import BaseBackend
+
+
+class Backend(BaseBackend):
+    """Extract plain text and the book title from FictionBook (fb2) files."""
+
+    def setup(self):
+        self.bs = None
+
+    def is_visible(self, elem):
+        """Return True if the text node ``elem`` should be part of the output.
+
+        Comments, processing instructions and doctypes are skipped, and so
+        is any text whose direct parent is not a ``body`` or ``p`` element.
+        """
+        # Comment must be checked by type: a parsed comment node does not
+        # contain the ``<!--``/``-->`` delimiters, so the regex below cannot
+        # catch it.
+        if isinstance(elem, (bs4.element.Comment,
+                             bs4.element.ProcessingInstruction,
+                             bs4.element.Doctype)):
+            return False
+
+        if elem.parent.name not in ["body", "p"]:
+            return False
+
+        if not PY3:
+            elem = elem.encode(self.encoding, self.encoding_errors)
+
+        # Text that merely looks like a comment is hidden as well.
+        if re.match('<!--.*-->', elem):
+            return False
+
+        return True
+
+    def handle_fobj(self, f):
+        """Return the visible text of the book read from file object ``f``.
+
+        Each visible text node is written on its own line and every
+        ``empty-line`` element yields one additional line break.
+        """
+        bdata = f.read()
+        tdata = self.decode(bdata)
+        text, bs = StringIO(), bs4.BeautifulSoup(tdata, 'lxml')
+
+        # Walk tags and strings together: ``<empty-line/>`` has no text
+        # children, so it can only be noticed as a tag.
+        for node in bs.descendants:
+            if isinstance(node, bs4.element.Tag):
+                if node.name == "empty-line":
+                    text.write(u"\n")
+            elif self.is_visible(node):
+                text.write(node)
+                text.write(u"\n")
+
+        return text.getvalue()
+
+    def handle_title(self, f):
+        """Return the string of the first ``book-title`` element, or None.
+
+        ``f`` is either an open file (re-read through its ``name``) or a
+        path.
+        """
+        fname = getattr(f, "name", f)
+
+        # Read bytes and reuse self.decode() so the title is decoded exactly
+        # like the body; io.open also exists on Python 2.
+        with io.open(fname, "rb") as book:
+            tdata = self.decode(book.read())
+
+        bs = bs4.BeautifulSoup(tdata, 'lxml')
+        title = bs.find("book-title")
+        if title is None:
+            return None
+        return title.string
diff --git a/fulltext/util.py b/fulltext/util.py
@@ -373,7 +373,22 @@ def check(self, title):
 
     def decode(self, s):
-        """Decode string."""
-        return s.decode(self.encoding, self.encoding_errors)
+        """Decode the byte string ``s`` into text.
+
+        ``self.encoding`` is tried first. If that raises
+        UnicodeDecodeError the encoding guessed by charset_normalizer is
+        used instead; when no encoding can be guessed the original error
+        is re-raised.
+        """
+        try:
+            return s.decode(self.encoding, self.encoding_errors)
+        except UnicodeDecodeError:
+            # Imported lazily so the dependency is only needed when the
+            # configured encoding fails.
+            from charset_normalizer import detect
+            guessed = detect(s)["encoding"]
+            if guessed is None:
+                raise
+            return s.decode(guessed, self.encoding_errors)
 
     def handle_title(self, path_or_file):
         """May be overridden by sublass in order to retrieve file title."""

diff --git a/requirements.txt b/requirements.txt
@@ -19,3 +19,4 @@ flake8
 git+https://github.com/mattgwwalker/msg-extractor.git@2a24c9950d34932ed5979693cb70d758d78715df#egg=ExtractMsg
 
 git+https://github.com/smarnach/pyexiftool.git#egg=pyexiftool
+charset-normalizer
diff --git a/setup.py b/setup.py
@@ -9,7 +9,7 @@
 
 
 NAME = 'fulltext'
-VERSION = '0.7'
+VERSION = '0.11'
 if os.name == 'nt' and not sys.maxsize > 2 ** 32:
     # https://github.com/btimby/fulltext/issues/79
     raise RuntimeError("Python 32 bit is not supported")
@@ -36,8 +36,8 @@
     version=VERSION,
     description='Convert binary files to plain text for indexing.',
     long_description=DESCRIPTION,
-    author='Ben Timby',
-    author_email='btimby@gmail.com',
+    author='Ben Timby, alekssamos',
+    author_email='btimby@gmail.com, aleks-samos@yandex.ru',
     maintainer='Ben Timby',
     maintainer_email='btimby@gmail.com',
     url='http://github.com/btimby/' + NAME + '/',
Original file line number	Diff line number	Diff line change
Expand Up		@@ -19,3 +19,4 @@ flake8
		git+https://github.com/mattgwwalker/msg-extractor.git@2a24c9950d34932ed5979693cb70d758d78715df#egg=ExtractMsg

		git+https://github.com/smarnach/pyexiftool.git#egg=pyexiftool
		charset-normalizer