From 9b7987469e8a5ad9caf2479f3e1e82cc819fcbcc Mon Sep 17 00:00:00 2001
From: Ewan Mellor
Date: Sat, 26 Jul 2014 23:38:06 -0700
Subject: [PATCH 1/2] Implement [self unread] when using streams for
 tokenization. Implement parsing of UTF-8 characters when using streams for
 tokenization.

[self unread] is used by many of the PKTokenizerState subclasses, so
without this feature, tokenization of streams is basically useless.

This change adds a circular buffer for all the data read from the stream,
and rewinds through this buffer to handle unreads. This places a limit on
the amount of rewinding that can be done (defaults to 256 unichars) but
that should be OK for practical purposes.

The UTF-8 support brings stream tokenization up to the same support as for
strings. The latter uses NSString.characterAtIndex to get UTF-16 code
points, and returns those from [self read]. For streams the parsing is not
as simple, but the result is now the same.

This adds a new field called isStreamInUTF8, to enable the UTF-8 parsing
for streams. Otherwise, the code behaves as before (returning data
byte-by-byte) for backwards compatibility.

This includes code derived from
http://opensource.apple.com/source/JavaScriptCore/JavaScriptCore-7534.57.3/wtf/unicode/UTF8.cpp
That code has a BSD-style license and is marked as follows:

 * Copyright (C) 2007 Apple Inc. All rights reserved.
 * Copyright (C) 2010 Patrick Gansterer
---
Review notes (below the "---" marker, so not part of the commit message):

 * readByte now treats only a positive return value from
   -[NSInputStream read:maxLength:] as data. The previous truthiness test
   also accepted -1 (stream error) and returned an uninitialised byte.
 * The licence header hunk no longer removes and re-adds the bare "//"
   line; it is kept as context.
 * The line structure of this series was rebuilt from the hunk counts and
   indentation was normalised to four spaces; apply with
   --ignore-whitespace if context lines do not match.
 * "index" lines for src/PKReader.m are omitted because the blob hashes
   changed with the edits above.
 * Text that was already missing from the input (e-mail addresses, the
   #import target, the protocol qualifier on "delegate") is left as found.

 include/PEGKit/PKReader.h    |   7 +
 include/PEGKit/PKTokenizer.h |   7 +
 src/PKReader.m               | 272 +++++++++++++++++++++++++++++++++--
 src/PKTokenizer.m            |  10 ++
 4 files changed, 282 insertions(+), 14 deletions(-)

diff --git a/include/PEGKit/PKReader.h b/include/PEGKit/PKReader.h
index 030a612..07e88ce 100644
--- a/include/PEGKit/PKReader.h
+++ b/include/PEGKit/PKReader.h
@@ -82,5 +82,12 @@
 */
 @property (nonatomic, readonly) NSUInteger offset;
 
+/*!
+    @property isStreamInUTF8
+    @brief true if self.stream is parsed as UTF-8, with each result from [self read] being a UTF-16 code point.
+           false if self.stream is unparsed, with each result from [self read] being a byte from the stream.
+ */
+@property (nonatomic) BOOL isStreamInUTF8;
+
 - (NSString *)debugDescription;
 @end
diff --git a/include/PEGKit/PKTokenizer.h b/include/PEGKit/PKTokenizer.h
index 26e343e..d6dc513 100644
--- a/include/PEGKit/PKTokenizer.h
+++ b/include/PEGKit/PKTokenizer.h
@@ -176,4 +176,11 @@
 
 @property (nonatomic, readonly) NSUInteger lineNumber;
 @property (nonatomic, assign) id delegate;
+
+/*!
+    @property isStreamInUTF8
+    @brief Passthrough to PKReader.isStreamInUTF8.
+*/
+@property (nonatomic) BOOL isStreamInUTF8;
+
 @end
diff --git a/src/PKReader.m b/src/PKReader.m
--- a/src/PKReader.m
+++ b/src/PKReader.m
@@ -1,7 +1,8 @@
 // The MIT License (MIT)
 //
 // Copyright (c) 2014 Todd Ditchendorf
+// Copyright (c) 2014 Ewan Mellor
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
 // in the Software without restriction, including without limitation the rights
@@ -20,22 +21,74 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
+/*
+ * isLegalUTF8, UTF8SequenceLength, and readUTF8Sequence are derived
+ * from the code detailed below. They have been modified to use
+ * different types and to conform to the newer UTF-8 specification
+ * (maximum 4 byte sequences).
+ *
+ * http://opensource.apple.com/source/JavaScriptCore/JavaScriptCore-7534.57.3/wtf/unicode/UTF8.cpp
+ *
+ * Copyright (C) 2007 Apple Inc. All rights reserved.
+ * Copyright (C) 2010 Patrick Gansterer
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
 #import
 
 @interface PKReader ()
 @property (nonatomic) NSUInteger offset;
 @property (nonatomic) NSUInteger length;
+
+/*!
+    @property buffer
+    @brief A circular buffer holding the last bufsize characters that were read from self.stream.
+
+    This is used to support [self unread]; by moving backwards through this buffer we can unread characters.
+    If self.length == 0, characters are written at self.offset and self.offset is incremented.
+    If self.length > 0, characters are read at self.offset and self.offset is incremented and self.length is decremented.
+    When a character is unread, self.offset is decremented and self.length is incremented.
+    self.offset wraps at 0 and self.bufsize of course.
+
+    This is not used if self.stream is not in use.
+ */
+@property (nonatomic) PKUniChar * buffer;
+@property (nonatomic) NSUInteger bufsize;
 @end
 
 @implementation PKReader
 
 - (instancetype)init {
-    return [self initWithString:nil];
+    self = [super init];
+    if (self) {
+        _bufsize = 256;
+    }
+    return self;
 }
 
 
 - (instancetype)initWithString:(NSString *)s {
-    self = [super init];
+    self = [self init];
     if (self) {
         self.string = s;
     }
@@ -44,7 +97,7 @@ - (instancetype)initWithString:(NSString *)s {
 
 
 - (instancetype)initWithStream:(NSInputStream *)s {
-    self = [super init];
+    self = [self init];
     if (self) {
         self.stream = s;
     }
@@ -55,6 +108,7 @@ - (instancetype)initWithStream:(NSInputStream *)s {
 - (void)dealloc {
     self.string = nil;
     self.stream = nil;
+    free(self.buffer);
     [super dealloc];
 }
 
@@ -87,31 +141,158 @@ - (void)setStream:(NSInputStream *)s {
         _length = NSNotFound;
     }
     // reset cursor
+    free(self.buffer);
+    self.buffer = malloc(sizeof(PKUniChar) * self.bufsize);
     self.offset = 0;
+    self.length = 0;
 }
 
 
 - (PKUniChar)read {
-    PKUniChar result = PKEOF;
-
     if (_string) {
-        if (_length && _offset < _length) {
-            result = [_string characterAtIndex:self.offset++];
+        return [self readFromString];
+    }
+    else {
+        if (self.length > 0) {
+            return [self popPKUniCharFromBuffer];
+        }
+        else if (self.isStreamInUTF8) {
+            return [self readFromStreamInUTF8];
         }
-    } else {
-        NSUInteger maxLen = 1; // 2 for wide char?
-        uint8_t c;
-        if ([_stream read:&c maxLength:maxLen]) {
-            result = (PKUniChar)c;
+        else {
+            return [self readFromStreamAsBytes];
         }
     }
-
+}
+
+
+-(PKUniChar)readFromString {
+    if (_length && _offset < _length) {
+        return [_string characterAtIndex:self.offset++];
+    }
+    else {
+        return PKEOF;
+    }
+}
+
+
+-(PKUniChar)readFromStreamInUTF8 {
+    UTF32Char ch32 = [self readUTF32Char];
+    if (ch32 == (UTF32Char)-1) {
+        return PKEOF;
+    }
+
+    unichar unichars[2];
+    BOOL isPair = CFStringGetSurrogatePairForLongCharacter(ch32, unichars);
+    if (isPair) {
+        // ch32 is represented by two unichars (two UTF-16 code points).
+        // Return the first, and put the second in the buffer and unread it so that
+        // it will be returned next time.
+        [self addPKUniCharToBuffer:unichars[0]];
+        [self addPKUniCharToBuffer:unichars[1]];
+        [self unread];
+    }
+    else {
+        [self addPKUniCharToBuffer:unichars[0]];
+    }
+    return unichars[0];
+}
+
+
+-(PKUniChar)readFromStreamAsBytes {
+    PKUniChar result = [self readByte];
+    if (result != PKEOF) {
+        [self addPKUniCharToBuffer:result];
+    }
+    return result;
+}
+
+
+-(PKUniChar)readByte {
+    uint8_t c;
+    if ([self.stream read:&c maxLength:1] > 0) {  // 0 is EOF, negative is a stream error: c is only valid when > 0
+        return (PKUniChar)c;
+    }
+    else {
+        return PKEOF;
+    }
+}
+
+
+-(UTF32Char)readUTF32Char {
+    uint8_t bytes[4];
+
+    NSInteger read = [self.stream read:bytes maxLength:1];
+    if (read <= 0) {
+        return (UTF32Char)-1;
+    }
+    size_t seqlen = UTF8SequenceLength(bytes[0]);
+    size_t byteCount = 1;
+
+#define LOGGABLE_BYTE(__i) \
+    (__i < byteCount ? (unsigned)bytes[__i] : UINT_MAX)
+#define LOGGABLE_BYTES \
+    LOGGABLE_BYTE(0), LOGGABLE_BYTE(1), LOGGABLE_BYTE(2), LOGGABLE_BYTE(3)
+
+    while (byteCount < seqlen) {
+        NSInteger read = [self.stream read:(bytes + byteCount) maxLength:1];
+        if (read <= 0) {
+            NSLog(@"Invalid UTF-8 sequence %x%x%x%x followed by EOF", LOGGABLE_BYTES);
+            return (UTF32Char)-1;
+        }
+        byteCount++;
+    }
+    if (isLegalUTF8(bytes, seqlen)) {
+        return readUTF8Sequence(bytes, seqlen);
+    }
+    else {
+        NSLog(@"Invalid UTF-8 sequence %x%x%x%x.", LOGGABLE_BYTES);
+        return (UTF32Char)-1;
+    }
+
+#undef LOGGABLE_BYTE
+#undef LOGGABLE_BYTES
+}
+
+
+-(void)addPKUniCharToBuffer:(PKUniChar)ch {
+    assert(ch != PKEOF);
+    assert(self.length == 0);
+
+    self.buffer[self.offset] = ch;
+    self.offset++;
+    if (self.offset >= self.bufsize) {
+        self.offset = 0;
+    }
+}
+
+
+-(PKUniChar)popPKUniCharFromBuffer {
+    assert(self.length > 0);
+
+    PKUniChar result = self.buffer[self.offset];
+    self.offset++;
+    if (self.offset >= self.bufsize) {
+        self.offset = 0;
+    }
+    self.length--;
     return result;
 }
 
 
 - (void)unread {
-    self.offset = (0 == _offset) ? 0 : _offset - 1;
+    if (self.stream) {
+        if (self.offset == 0) {
+            self.offset = self.bufsize - 1;
+        }
+        else {
+            self.offset--;
+        }
+        self.length++;
+    }
+    else {
+        self.offset = (0 == _offset) ? 0 : _offset - 1;
+    }
 }
 
 
@@ -121,4 +302,67 @@ - (void)unread:(NSUInteger)count {
     }
 }
 
+
+static int UTF8SequenceLength(uint8_t b0) {
+    if ((b0 & 0x80) == 0)
+        return 1;
+    if ((b0 & 0xC0) != 0xC0)
+        return 0;
+    if ((b0 & 0xE0) == 0xC0)
+        return 2;
+    if ((b0 & 0xF0) == 0xE0)
+        return 3;
+    if ((b0 & 0xF8) == 0xF0)
+        return 4;
+    return 0;
+}
+
+
+// This must be called with the length pre-determined by the first byte (i.e. by UTF8SequenceLength).
+// If presented with a length > 4, this returns false. The Unicode
+// definition of UTF-8 goes up to 4-byte sequences.
+static bool isLegalUTF8(const uint8_t * source, size_t length) {
+    uint8_t a;
+    const uint8_t * srcptr = source + length;
+    switch (length) {
+    default: return false;
+    // Everything else falls through when "true"...
+    case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+    case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+    case 2: if ((a = (*--srcptr)) > 0xBF) return false;
+
+        switch (*source) {
+        // no fall-through in this inner switch
+        case 0xE0: if (a < 0xA0) return false; break;
+        case 0xED: if (a > 0x9F) return false; break;
+        case 0xF0: if (a < 0x90) return false; break;
+        case 0xF4: if (a > 0x8F) return false; break;
+        default: if (a < 0x80) return false;
+        }
+
+    case 1: if (*source >= 0x80 && *source < 0xC2) return false;
+    }
+    if (*source > 0xF4)
+        return false;
+    return true;
+}
+
+
+static const UTF32Char offsetsFromUTF8[4] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL };
+
+static UTF32Char readUTF8Sequence(const uint8_t * sequence, size_t length) {
+    UTF32Char character = 0;
+
+    // The cases all fall through.
+    switch (length) {
+        case 4: character += *sequence++; character <<= 6;
+        case 3: character += *sequence++; character <<= 6;
+        case 2: character += *sequence++; character <<= 6;
+        case 1: character += *sequence++;
+    }
+
+    return character - offsetsFromUTF8[length - 1];
+}
+
+
 @end
diff --git a/src/PKTokenizer.m b/src/PKTokenizer.m
index 1d1cafd..ad0d861 100644
--- a/src/PKTokenizer.m
+++ b/src/PKTokenizer.m
@@ -274,6 +274,16 @@ - (void)setStream:(NSInputStream *)s {
 }
 
 
+-(void)setIsStreamInUTF8:(BOOL)isStreamInUTF8 {
+    self.reader.isStreamInUTF8 = isStreamInUTF8;
+}
+
+
+-(BOOL)isStreamInUTF8 {
+    return self.reader.isStreamInUTF8;
+}
+
+
 #pragma mark -
 
 - (PKTokenizerState *)tokenizerStateFor:(PKUniChar)c {

From d5afed970fc0d70f3bb950063260c8dfa8644c80 Mon Sep 17 00:00:00 2001
From: Ewan Mellor
Date: Mon, 28 Dec 2015 19:05:05 -0800
Subject: [PATCH 2/2] Fix PKReader.offset in the case where streams are being
 used.

The unread functionality added in 9b7987469e used self.offset as the offset
into self.buffer. That's no good though, because self.offset is externally
expected to be the offset into the whole input (i.e. the stream) and not
the offset into an internal buffer.

Fix this by adding a separate self.bufOffset.

self.offset is advanced where a character is actually produced
(addPKUniCharToBuffer and popPKUniCharFromBuffer) and rewound in unread.
A read that hits EOF therefore leaves it unchanged, and a character
outside the BMP advances it by two, one per UTF-16 unit returned.
---
Review notes (below the "---" marker, so not part of the commit message):

 * The earlier revision incremented self.offset at the top of -read. That
   counted reads which returned PKEOF, and under-counted surrogate pairs by
   one because readFromStreamInUTF8 calls [self unread] internally.
 * The @property buffer comment now names self.bufOffset as the wrapping
   cursor instead of self.offset.
 * "index" line for src/PKReader.m omitted; see the note on patch 1.

 include/PEGKit/PKReader.h |  2 +-
 src/PKReader.m            | 40 +++++++++++++++++++++++++---------------
 2 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/include/PEGKit/PKReader.h b/include/PEGKit/PKReader.h
index 07e88ce..57d847c 100644
--- a/include/PEGKit/PKReader.h
+++ b/include/PEGKit/PKReader.h
@@ -78,7 +78,7 @@
 
 /*!
     @property offset
-    @brief This reader's current offset in string
+    @brief This reader's current offset in string or the offset from the start of the stream.
 */
 @property (nonatomic, readonly) NSUInteger offset;
 
diff --git a/src/PKReader.m b/src/PKReader.m
--- a/src/PKReader.m
+++ b/src/PKReader.m
@@ -58,6 +58,7 @@
 
 @interface PKReader ()
 @property (nonatomic) NSUInteger offset;
+@property (nonatomic) NSUInteger bufOffset;
 @property (nonatomic) NSUInteger length;
 
 /*!
@@ -65,12 +66,15 @@ @interface PKReader ()
     @brief A circular buffer holding the last bufsize characters that were read from self.stream.
 
     This is used to support [self unread]; by moving backwards through this buffer we can unread characters.
-    If self.length == 0, characters are written at self.offset and self.offset is incremented.
-    If self.length > 0, characters are read at self.offset and self.offset is incremented and self.length is decremented.
-    When a character is unread, self.offset is decremented and self.length is incremented.
-    self.offset wraps at 0 and self.bufsize of course.
+    If self.length == 0, characters are written at self.bufOffset and self.bufOffset is incremented.
+    If self.length > 0, characters are read at self.bufOffset and self.bufOffset is incremented and self.length is decremented.
+    When a character is unread, self.bufOffset is decremented and self.length is incremented.
+    self.bufOffset wraps at 0 and self.bufsize of course.
 
     This is not used if self.stream is not in use.
+
+    Note that self.offset and self.bufOffset are not the same, because self.offset is defined to be
+    from the start of the stream, but self.bufOffset wraps at the end of the buffer.
  */
 @property (nonatomic) PKUniChar * buffer;
 @property (nonatomic) NSUInteger bufsize;
@@ -144,6 +148,7 @@ - (void)setStream:(NSInputStream *)s {
     free(self.buffer);
     self.buffer = malloc(sizeof(PKUniChar) * self.bufsize);
     self.offset = 0;
+    self.bufOffset = 0;
     self.length = 0;
 }
 
@@ -259,10 +264,11 @@ -(void)addPKUniCharToBuffer:(PKUniChar)ch {
     assert(ch != PKEOF);
     assert(self.length == 0);
 
-    self.buffer[self.offset] = ch;
-    self.offset++;
-    if (self.offset >= self.bufsize) {
-        self.offset = 0;
+    self.buffer[self.bufOffset] = ch;
+    self.bufOffset++;
+    if (self.bufOffset >= self.bufsize) {
+        self.bufOffset = 0;
     }
+    self.offset++;  // one stream position per character newly taken from the stream
 }
 
@@ -270,10 +276,11 @@ -(void)addPKUniCharToBuffer:(PKUniChar)ch {
 -(PKUniChar)popPKUniCharFromBuffer {
     assert(self.length > 0);
 
-    PKUniChar result = self.buffer[self.offset];
-    self.offset++;
-    if (self.offset >= self.bufsize) {
-        self.offset = 0;
+    PKUniChar result = self.buffer[self.bufOffset];
+    self.bufOffset++;
+    if (self.bufOffset >= self.bufsize) {
+        self.bufOffset = 0;
     }
     self.length--;
+    self.offset++;  // re-reading an unread character moves the stream position forward again
     return result;
@@ -282,11 +289,14 @@ -(PKUniChar)popPKUniCharFromBuffer {
 
 - (void)unread {
     if (self.stream) {
-        if (self.offset == 0) {
-            self.offset = self.bufsize - 1;
+        assert(self.offset > 0);
+        self.offset--;
+
+        if (self.bufOffset == 0) {
+            self.bufOffset = self.bufsize - 1;
         }
         else {
-            self.offset--;
+            self.bufOffset--;
         }
         self.length++;
     }