diff --git a/include/PEGKit/PKReader.h b/include/PEGKit/PKReader.h index 030a612..57d847c 100644 --- a/include/PEGKit/PKReader.h +++ b/include/PEGKit/PKReader.h @@ -78,9 +78,16 @@ /*! @property offset - @brief This reader's current offset in string + @brief This reader's current offset in string or the offset from the start of the stream. */ @property (nonatomic, readonly) NSUInteger offset; +/*! + @property isStreamInUTF8 + @brief true if self.stream is parsed as UTF-8, with each result from [self read] being a UTF-16 code unit (a surrogate pair is returned as two consecutive results). + false if self.stream is unparsed, with each result from [self read] being a byte from the stream. + */ +@property (nonatomic) BOOL isStreamInUTF8; + - (NSString *)debugDescription; @end diff --git a/include/PEGKit/PKTokenizer.h b/include/PEGKit/PKTokenizer.h index 26e343e..d6dc513 100644 --- a/include/PEGKit/PKTokenizer.h +++ b/include/PEGKit/PKTokenizer.h @@ -176,4 +176,11 @@ @property (nonatomic, readonly) NSUInteger lineNumber; @property (nonatomic, assign) id delegate; + +/*! + @property isStreamInUTF8 + @brief Passthrough to PKReader.isStreamInUTF8. +*/ +@property (nonatomic) BOOL isStreamInUTF8; + @end diff --git a/src/PKReader.m b/src/PKReader.m index 41fbd25..04a8a95 100644 --- a/src/PKReader.m +++ b/src/PKReader.m @@ -1,7 +1,8 @@ // The MIT License (MIT) // // Copyright (c) 2014 Todd Ditchendorf -// +// Copyright (c) 2014 Ewan Mellor +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights @@ -20,22 +21,78 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. +/* + * isLegalUTF8, UTF8SequenceLength, and readUTF8Sequence are derived + * from the code detailed below. They have been modified to use + * different types and to conform to the newer UTF-8 specification + * (maximum 4 byte sequences). 
+ * + * http://opensource.apple.com/source/JavaScriptCore/JavaScriptCore-7534.57.3/wtf/unicode/UTF8.cpp + * + * Copyright (C) 2007 Apple Inc. All rights reserved. + * Copyright (C) 2010 Patrick Gansterer + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + #import @interface PKReader () @property (nonatomic) NSUInteger offset; +@property (nonatomic) NSUInteger bufOffset; @property (nonatomic) NSUInteger length; + +/*! + @property buffer + @brief A circular buffer holding the last bufsize characters that were read from self.stream. + + This is used to support [self unread]; by moving backwards through this buffer we can unread characters. + If self.length == 0, characters are written at self.bufOffset and self.bufOffset is incremented. 
+ If self.length > 0, characters are read at self.bufOffset and self.bufOffset is incremented and self.length is decremented. + When a character is unread, self.bufOffset is decremented and self.length is incremented. + self.bufOffset wraps at 0 and self.bufsize of course. + + This is not used if self.stream is not in use. + + Note that self.offset and self.bufOffset are not the same, because self.offset is defined to be + from the start of the stream, but self.bufOffset wraps at the end of the buffer. + */ +@property (nonatomic) PKUniChar * buffer; +@property (nonatomic) NSUInteger bufsize; @end @implementation PKReader - (instancetype)init { - return [self initWithString:nil]; + self = [super init]; + if (self) { + _bufsize = 256; + } + return self; } - (instancetype)initWithString:(NSString *)s { - self = [super init]; + self = [self init]; if (self) { self.string = s; } @@ -44,7 +101,7 @@ - (instancetype)initWithString:(NSString *)s { - (instancetype)initWithStream:(NSInputStream *)s { - self = [super init]; + self = [self init]; if (self) { self.stream = s; } @@ -55,6 +112,7 @@ - (instancetype)initWithStream:(NSInputStream *)s { - (void)dealloc { self.string = nil; self.stream = nil; + free(self.buffer); [super dealloc]; } @@ -87,31 +145,163 @@ - (void)setStream:(NSInputStream *)s { _length = NSNotFound; } // reset cursor + free(self.buffer); + self.buffer = malloc(sizeof(PKUniChar) * self.bufsize); self.offset = 0; + self.bufOffset = 0; + self.length = 0; } - (PKUniChar)read { - PKUniChar result = PKEOF; - if (_string) { - if (_length && _offset < _length) { - result = [_string characterAtIndex:self.offset++]; + return [self readFromString]; + } + else { + self.offset++; + if (self.length > 0) { + return [self popPKUniCharFromBuffer]; } - } else { - NSUInteger maxLen = 1; // 2 for wide char? 
- uint8_t c; - if ([_stream read:&c maxLength:maxLen]) { - result = (PKUniChar)c; + else if (self.isStreamInUTF8) { + return [self readFromStreamInUTF8]; + } + else { + return [self readFromStreamAsBytes]; } } - +} + + +-(PKUniChar)readFromString { + if (_length && _offset < _length) { + return [_string characterAtIndex:self.offset++]; + } + else { + return PKEOF; + } +} + + +-(PKUniChar)readFromStreamInUTF8 { + UTF32Char ch32 = [self readUTF32Char]; + if (ch32 == (UTF32Char)-1) { + return PKEOF; + } + + unichar unichars[2]; + BOOL isPair = CFStringGetSurrogatePairForLongCharacter(ch32, unichars); + if (isPair) { + // ch32 is represented by two unichars (two UTF-16 code units). + // Return the first, and put the second in the buffer and unread it so that + // it will be returned next time. + [self addPKUniCharToBuffer:unichars[0]]; + [self addPKUniCharToBuffer:unichars[1]]; + [self unread]; + } + else { + [self addPKUniCharToBuffer:unichars[0]]; + } + return unichars[0]; +} + + +-(PKUniChar)readFromStreamAsBytes { + PKUniChar result = [self readByte]; + if (result != PKEOF) { + [self addPKUniCharToBuffer:result]; + } + return result; +} + + +-(PKUniChar)readByte { /* Reads one raw byte from self.stream; returns PKEOF at end of stream or when the read fails. */ + uint8_t c; + if ([self.stream read:&c maxLength:1] > 0) { /* 0 means end of stream and a negative count means error; in both cases c was not filled in. */ + return (PKUniChar)c; + } + else { + return PKEOF; + } +} + + +-(UTF32Char)readUTF32Char { + uint8_t bytes[4]; + + NSInteger read = [self.stream read:bytes maxLength:1]; + if (read <= 0) { + return (UTF32Char)-1; + } + size_t seqlen = UTF8SequenceLength(bytes[0]); + size_t byteCount = 1; + +#define LOGGABLE_BYTE(__i) \ + (__i < byteCount ? 
(unsigned)bytes[__i] : UINT_MAX) +#define LOGGABLE_BYTES \ + LOGGABLE_BYTE(0), LOGGABLE_BYTE(1), LOGGABLE_BYTE(2), LOGGABLE_BYTE(3) + + while (byteCount < seqlen) { + NSInteger read = [self.stream read:(bytes + byteCount) maxLength:1]; + if (read <= 0) { + NSLog(@"Invalid UTF-8 sequence %x%x%x%x followed by EOF", LOGGABLE_BYTES); + return (UTF32Char)-1; + } + byteCount++; + } + if (isLegalUTF8(bytes, seqlen)) { + return readUTF8Sequence(bytes, seqlen); + } + else { + NSLog(@"Invalid UTF-8 sequence %x%x%x%x.", LOGGABLE_BYTES); + return (UTF32Char)-1; + } + +#undef LOGGABLE_BYTE +#undef LOGGABLE_BYTES +} + + +-(void)addPKUniCharToBuffer:(PKUniChar)ch { + assert(ch != PKEOF); + assert(self.length == 0); + + self.buffer[self.bufOffset] = ch; + self.bufOffset++; + if (self.bufOffset >= self.bufsize) { + self.bufOffset = 0; + } +} + + +-(PKUniChar)popPKUniCharFromBuffer { + assert(self.length > 0); + + PKUniChar result = self.buffer[self.bufOffset]; + self.bufOffset++; + if (self.bufOffset >= self.bufsize) { + self.bufOffset = 0; + } + self.length--; return result; } - (void)unread { - self.offset = (0 == _offset) ? 0 : _offset - 1; + if (self.stream) { + assert(self.offset > 0); + self.offset--; + + if (self.bufOffset == 0) { + self.bufOffset = self.bufsize - 1; + } + else { + self.bufOffset--; + } + self.length++; + } + else { + self.offset = (0 == _offset) ? 0 : _offset - 1; + } } @@ -121,4 +311,67 @@ - (void)unread:(NSUInteger)count { } } + +static int UTF8SequenceLength(uint8_t b0) { + if ((b0 & 0x80) == 0) + return 1; + if ((b0 & 0xC0) != 0xC0) + return 0; + if ((b0 & 0xE0) == 0xC0) + return 2; + if ((b0 & 0xF0) == 0xE0) + return 3; + if ((b0 & 0xF8) == 0xF0) + return 4; + return 0; +} + + +// This must be called with the length pre-determined by the first byte (i.e. by UTF8SequenceLength). +// If presented with a length > 4, this returns false. The Unicode +// definition of UTF-8 goes up to 4-byte sequences. 
+static bool isLegalUTF8(const uint8_t * source, size_t length) { + uint8_t a; + const uint8_t * srcptr = source + length; + switch (length) { + default: return false; + // Everything else falls through when "true"... + case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 2: if ((a = (*--srcptr)) > 0xBF) return false; + + switch (*source) { + // no fall-through in this inner switch + case 0xE0: if (a < 0xA0) return false; break; + case 0xED: if (a > 0x9F) return false; break; + case 0xF0: if (a < 0x90) return false; break; + case 0xF4: if (a > 0x8F) return false; break; + default: if (a < 0x80) return false; + } + + case 1: if (*source >= 0x80 && *source < 0xC2) return false; + } + if (*source > 0xF4) + return false; + return true; +} + + +static const UTF32Char offsetsFromUTF8[4] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL }; + +static UTF32Char readUTF8Sequence(const uint8_t * sequence, size_t length) { + UTF32Char character = 0; + + // The cases all fall through. + switch (length) { + case 4: character += *sequence++; character <<= 6; + case 3: character += *sequence++; character <<= 6; + case 2: character += *sequence++; character <<= 6; + case 1: character += *sequence++; + } + + return character - offsetsFromUTF8[length - 1]; +} + + @end diff --git a/src/PKTokenizer.m b/src/PKTokenizer.m index 1d1cafd..ad0d861 100644 --- a/src/PKTokenizer.m +++ b/src/PKTokenizer.m @@ -274,6 +274,16 @@ - (void)setStream:(NSInputStream *)s { } +-(void)setIsStreamInUTF8:(BOOL)isStreamInUTF8 { + self.reader.isStreamInUTF8 = isStreamInUTF8; +} + + +-(BOOL)isStreamInUTF8 { + return self.reader.isStreamInUTF8; +} + + #pragma mark - - (PKTokenizerState *)tokenizerStateFor:(PKUniChar)c {