Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
build/
dist/
snudown.egg-info/
src/html_entities.gperf.generated
src/html_entities.h
*.pyc
*.so
Expand Down
70 changes: 67 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from distutils.spawn import find_executable
from distutils.dep_util import newer_group
from setuptools import setup, Extension
from setuptools.command.build_ext import build_ext

import re
import os
import subprocess
import fnmatch
import json

def c_files_in(directory):
paths = []
Expand All @@ -14,11 +16,71 @@ def c_files_in(directory):
paths.append(os.path.join(directory, f))
return paths

def send_html_entities(entities_file, outfh, seen_entities):
# Convert entity list from HTML5 spec JSON and send it to gperf
with open(entities_file) as entitiesfh:
for entity, entityinfo in sorted(json.load(entitiesfh).items()):
if not entity.endswith(';'):
continue
if entity in seen_entities:
continue
seen_entities.add(entity)

def process_gperf_file(gperf_file, output_file):
# Some sanity checks on the codepoints
ncodepoints = len(entityinfo['codepoints'])
assert ncodepoints <= 2 # MAX_ENTITY_CODEPOINTS
for bad_range in [xrange(0, 9), xrange(11, 13), xrange(14, 32),
xrange(55296, 57344), xrange(65534, 65536)]:
for codepoint in entityinfo['codepoints']:
assert codepoint not in bad_range
codepoints = ",".join(str(x) for x in entityinfo['codepoints'])

# Output the entity
outfh.write("%s, %d, {%s}\n" % (entity, ncodepoints, codepoints))


def process_gperf_file(gperf_file, entities_file, output_file, force=False):
if not find_executable("gperf"):
raise Exception("Couldn't find `gperf`, is it installed?")
subprocess.check_call(["gperf", gperf_file, "--output-file=%s" % output_file])

# Do not rerun gperf if no change to input files
if not force and not newer_group((gperf_file, entities_file), output_file):
return

# Combine HTML5 entity data into the gperf input file. HTML5
# entities are translated to numeric entities at runtime, as
# opposed to the entities already in the gperf file which are
# output verbatim.
gperf_temp_file = gperf_file + ".generated"
seen_entities = set()
found_separator = 0
with open(gperf_temp_file, 'w') as outfh:
with open(gperf_file) as f:
for line in f:
entity = line.strip()
# gperf files are divided into three sections, divided
# by `%%` lines. The first is for declarations, the
# second is the list of keywords, and the third is for
# functions (which are included verbatim in the
# output). We want to add the HTML5 entities to the
# end of the second section (i.e. right before the
# third section).
if entity == '%%':
found_separator += 1
if found_separator == 2:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: A comment explaining which number maps to which section would be helpful for people who don't know gperf well.

send_html_entities(entities_file, outfh, seen_entities)
elif found_separator == 1:
# Track the entities we've seen so far so that we
# don't repeat them.
entity = entity.split()[0]
seen_entities.add(entity)
outfh.write(line)
if found_separator < 2:
# The gperf file contains no third section.
send_html_entities(entities_file, outfh, seen_entities)

subprocess.check_call(["gperf", gperf_temp_file,
"--output-file", output_file])

version = None
version_re = re.compile(r'^#define\s+SNUDOWN_VERSION\s+"([^"]+)"$')
Expand All @@ -32,7 +94,8 @@ def process_gperf_file(gperf_file, output_file):

class GPerfingBuildExt(build_ext):
def run(self):
process_gperf_file("src/html_entities.gperf", "src/html_entities.h")
process_gperf_file("src/html_entities.gperf", "src/html_entities.json",
"src/html_entities.h", force=self.force)
build_ext.run(self)

setup(
Expand All @@ -47,6 +110,7 @@ def run(self):
Extension(
name='snudown',
sources=['snudown.c'] + c_files_in('src/') + c_files_in('html/'),
depends=['src/html_entities.h'],
include_dirs=['src', 'html']
)
],
Expand Down
30 changes: 24 additions & 6 deletions src/html_entities.gperf
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
%language=ANSI-C
%define lookup-function-name is_allowed_named_entity
%define lookup-function-name resolve_named_entity
%compare-strncmp
%readonly-tables
%define hash-function-name hash_html_entity
Expand All @@ -21,11 +21,11 @@ inline int is_valid_numeric_entity(uint32_t entity_val)
*
* According to lxml these are all problematic:
*
* [xrange(0, 8),
* xrange(11, 12),
* xrange(14, 31),
* xrange(55296, 57343),
* xrange(65534, 65535)]
* [xrange(0, 9),
* xrange(11, 13),
* xrange(14, 32),
* xrange(55296, 57344),
* xrange(65534, 65536)]
*/
return (entity_val > 8
&& (entity_val != 11 && entity_val != 12)
Expand All @@ -35,8 +35,26 @@ inline int is_valid_numeric_entity(uint32_t entity_val)
&& entity_val <= MAX_NUM_ENTITY_VAL);
}

/* Maximum number of codepoints for a named entity. */
#define MAX_ENTITY_CODEPOINTS 2
%}
%struct-type
%define slot-name entity
struct html_entity {
/* Entity string */
const char *entity;
/* Number of codepoints; 0 for raw output. */
const int output_numeric;
/* Codepoint numbers for numeric output. */
const int codepoints[MAX_ENTITY_CODEPOINTS];
}
%%
# The following entities are allowed verbatim, due to their broad
# support (e.g. in XHTML). HTML5 supports additional entities, which
# Snudown converts to numeric entities to maintain compatibility.
# The HTML5 entities are inserted by setup.py based on
# html_entities.json. Source:
# http://www.w3.org/TR/2014/REC-html5-20141028/entities.json
&AElig;
&Aacute;
&Acirc;
Expand Down
Loading