Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ Supported formats
+-----------+-------------------------------------+----------------------------------------------+
| ``epub`` | ``ebooklib`` module | ``ebooklib`` module |
+-----------+-------------------------------------+----------------------------------------------+
| ``fb2`` | ``BeautifulSoup`` module | ``BeautifulSoup`` module |
+-----------+-------------------------------------+----------------------------------------------+
| ``gif`` | tesseract CLI and pytesseract module | |
+-----------+-------------------------------------+----------------------------------------------+
| ``gz`` | python ``gzip`` module | python ``gzip`` module |
Expand Down Expand Up @@ -116,6 +118,8 @@ file extensions:
+-----------+-------------------------------------+----------------------------------------------+
| ``epub`` | ``exiftool`` CLI tool | |
+-----------+-------------------------------------+----------------------------------------------+
| ``fb2`` | ``BeautifulSoup`` module | ``BeautifulSoup`` module |
+-----------+-------------------------------------+----------------------------------------------+
| ``html`` | ``BeautifulSoup`` module | ``BeautifulSoup`` module |
+-----------+-------------------------------------+----------------------------------------------+
| ``odt`` | ``exiftool`` CLI tool | ``exiftool`` CLI tool |
Expand Down
8 changes: 7 additions & 1 deletion fulltext/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@
from fulltext.backends import __docx # NOQA
from fulltext.backends import __eml # NOQA
from fulltext.backends import __epub # NOQA
from fulltext.backends import __fb2 # NOQA
from fulltext.backends import __gz # NOQA
from fulltext.backends import __html # NOQA
from fulltext.backends import __hwp # NOQA
Expand Down Expand Up @@ -276,6 +277,11 @@ def register_backend(mimetype, module, extensions=None):
'fulltext.backends.__epub',
extensions=[".epub"])

# Map the fb2 mimetype and the ".fb2" extension to the fb2 backend module.
register_backend(
    'application/fb2',
    'fulltext.backends.__fb2',
    extensions=['.fb2'])

register_backend(
'application/postscript',
'fulltext.backends.__ps',
Expand Down Expand Up @@ -577,7 +583,7 @@ def _get(path_or_file, default, mime, name, backend, encoding,
inst.teardown()

assert text is not None, "backend function returned None"
text = STRIP_WHITE.sub(' ', text)
# text = STRIP_WHITE.sub(' ', text)
text = text.strip()
return (text, title)

Expand Down
61 changes: 61 additions & 0 deletions fulltext/backends/__fb2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from __future__ import absolute_import

import re

import bs4

from six import StringIO
from six import PY3

from fulltext.util import BaseBackend


class Backend(BaseBackend):
    """Backend that extracts plain text and the title from ``.fb2`` files.

    The document is parsed with BeautifulSoup's ``lxml`` parser. Only text
    nodes whose direct parent is a ``<body>`` or ``<p>`` element end up in
    the extracted text.
    """

    def setup(self):
        # Kept for interface compatibility; nothing in this class assigns a
        # parsed tree to this attribute.
        self.bs = None

    def is_visible(self, elem):
        """Return True if text node *elem* belongs in the extracted text.

        Comments, processing instructions and doctype nodes are rejected,
        as is any text whose direct parent is not ``<body>`` or ``<p>``.
        """
        # bs4 yields comments as Comment nodes whose value does NOT include
        # the "<!--" / "-->" delimiters, so the regex check further down
        # cannot catch them; filter them by type instead.
        if isinstance(elem, (bs4.element.Comment,
                             bs4.element.ProcessingInstruction,
                             bs4.element.Doctype)):
            return False

        if elem.parent.name not in ("body", "p"):
            return False

        if not PY3:
            # On Python 2 the regex below is matched against encoded bytes.
            elem = elem.encode(self.encoding, self.encoding_errors)

        # Retained from the original: drops text that literally looks like
        # a comment (e.g. one that was escaped in the markup).
        if re.match('<!--.*-->', elem):
            return False

        return True

    def handle_fobj(self, f):
        """Return the visible text of the document read from *f*.

        *f* is a binary file object; its content is decoded with
        ``self.decode()`` before parsing. Every visible text node is
        written followed by a newline.
        """
        soup = bs4.BeautifulSoup(self.decode(f.read()), 'lxml')
        text = StringIO()

        for elem in soup.find_all(text=True):
            if elem.parent.name == "empty-line":
                text.write(u"\n")
            if self.is_visible(elem):
                text.write(elem)
                text.write(u"\n")

        return text.getvalue()

    def handle_title(self, f):
        """Return the content of the first ``<book-title>`` element.

        *f* is either a file object exposing a ``name`` attribute or a
        filesystem path. Returns None when the document has no
        ``<book-title>`` element (or when that element has no single
        string child).
        """
        try:
            fname = f.name
        except AttributeError:
            fname = f

        # Read raw bytes and decode through self.decode() so the title is
        # decoded exactly like the body text in handle_fobj(). This also
        # avoids open(..., encoding=...), which does not exist on Python 2.
        with open(fname, "rb") as book:
            data = self.decode(book.read())

        title = bs4.BeautifulSoup(data, 'lxml').find("book-title")
        if title is None:
            return None
        return title.string
6 changes: 5 additions & 1 deletion fulltext/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,11 @@ def check(self, title):

def decode(self, s):
    """Decode the byte string *s* to text.

    The configured ``self.encoding`` / ``self.encoding_errors`` are tried
    first. If that raises ``UnicodeDecodeError`` (which can only happen
    with a strict error handler), the encoding is guessed with
    ``charset_normalizer.detect`` and decoding is retried with the guess.

    Raises:
        UnicodeDecodeError: if decoding with the configured encoding fails
            and no encoding could be detected, or decoding with the
            detected encoding fails as well.
    """
    try:
        return s.decode(self.encoding, self.encoding_errors)
    except UnicodeDecodeError:
        # Imported lazily: only needed on the fallback path.
        from charset_normalizer import detect

        guessed = detect(s)["encoding"]
        if guessed is None:
            # Detection failed; surface the original decoding error
            # instead of a TypeError from s.decode(None, ...).
            raise
        return s.decode(guessed, self.encoding_errors)

def handle_title(self, path_or_file):
"""May be overridden by sublass in order to retrieve file title."""
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,4 @@ flake8
git+https://github.com/mattgwwalker/msg-extractor.git@2a24c9950d34932ed5979693cb70d758d78715df#egg=ExtractMsg

git+https://github.com/smarnach/pyexiftool.git#egg=pyexiftool
charset-normalizer
6 changes: 3 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@


NAME = 'fulltext'
VERSION = '0.7'
VERSION = '0.11'
if os.name == 'nt' and not sys.maxsize > 2 ** 32:
# https://github.com/btimby/fulltext/issues/79
raise RuntimeError("Python 32 bit is not supported")
Expand All @@ -36,8 +36,8 @@
version=VERSION,
description='Convert binary files to plain text for indexing.',
long_description=DESCRIPTION,
author='Ben Timby',
author_email='btimby@gmail.com',
author='Ben Timby, alekssamos',
author_email='btimby@gmail.com, aleks-samos@yandex.ru',
maintainer='Ben Timby',
maintainer_email='btimby@gmail.com',
url='http://github.com/btimby/' + NAME + '/',
Expand Down