itod · ewanmellor · Jul 27, 2014 · Dec 29, 2015
diff --git a/include/PEGKit/PKReader.h b/include/PEGKit/PKReader.h
@@ -78,9 +78,16 @@
 
 /*!
     @property   offset
-    @brief      This reader's current offset in string
+    @brief      This reader's current offset in string or the offset from the start of the stream.
 */
 @property (nonatomic, readonly) NSUInteger offset;
 
+/*!
+    @property   isStreamInUTF8
+    @brief      true if self.stream is parsed as UTF-8, with each result from [self read] being a UTF-16 code point.
+                false if self.stream is unparsed, with each result from [self read] being a byte from the stream.
+ */
+@property (nonatomic) BOOL isStreamInUTF8;
+
 - (NSString *)debugDescription;
 @end
diff --git a/include/PEGKit/PKTokenizer.h b/include/PEGKit/PKTokenizer.h
@@ -176,4 +176,11 @@
 
 @property (nonatomic, readonly) NSUInteger lineNumber;
 @property (nonatomic, assign) id <PKTokenizerDelegate>delegate;
+
+/*!
+    @property   isStreamInUTF8
+    @brief      Passthrough to PKReader.isStreamInUTF8.
+*/
+@property (nonatomic) BOOL isStreamInUTF8;
+
 @end
diff --git a/src/PKReader.m b/src/PKReader.m
@@ -1,7 +1,8 @@
 // The MIT License (MIT)
 // 
 // Copyright (c) 2014 Todd Ditchendorf
-// 
+// Copyright (c) 2014 Ewan Mellor
+//
 // Permission is hereby granted, free of charge, to any person obtaining a copy 
 // of this software and associated documentation files (the "Software"), to deal
 // in the Software without restriction, including without limitation the rights
@@ -20,22 +21,78 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
+/*
+ * isLegalUTF8, UTF8SequenceLength, and readUTF8Sequence are derived
+ * from the code detailed below.  They have been modified to use
+ * different types and to conform to the newer UTF-8 specification
+ * (maximum 4 byte sequences).
+ *
+ * http://opensource.apple.com/source/JavaScriptCore/JavaScriptCore-7534.57.3/wtf/unicode/UTF8.cpp
+ *
+ * Copyright (C) 2007 Apple Inc.  All rights reserved.
+ * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
 #import <PEGKit/PKReader.h>
 
 @interface PKReader ()
 @property (nonatomic) NSUInteger offset;
+@property (nonatomic) NSUInteger bufOffset;
 @property (nonatomic) NSUInteger length;
+
+/*!
+    @property   buffer
+    @brief      A circular buffer holding the last bufsize characters that were read from self.stream.
+
+    This is used to support [self unread]; by moving backwards through this buffer we can unread characters.
+    If self.length == 0, characters are written at self.offset and self.offset is incremented.
+    If self.length > 0, characters are read at self.offset and self.offset is incremented and self.length is decremented.
+    When a character is unread, self.offset is decremented and self.length is incremented.
+    self.offset wraps at 0 and self.bufsize of course.
+
+    This is not used if self.stream is not in use.
+
+    Note that self.offset and self.bufOffset are not the same, because self.offset is defined to be
+    from the start of the stream, but self.bufOffset wraps at the end of the buffer.
+ */
+@property (nonatomic) PKUniChar * buffer;
+@property (nonatomic) NSUInteger bufsize;
 @end
 
 @implementation PKReader
 
 - (instancetype)init {
-    return [self initWithString:nil];
+    self = [super init];
+    if (self) {
+        _bufsize = 256;
+    }
+    return self;
 }
 
 
 - (instancetype)initWithString:(NSString *)s {
-    self = [super init];
+    self = [self init];
     if (self) {
         self.string = s;
     }
@@ -44,7 +101,7 @@ - (instancetype)initWithString:(NSString *)s {
 
 
 - (instancetype)initWithStream:(NSInputStream *)s {
-    self = [super init];
+    self = [self init];
     if (self) {
         self.stream = s;
     }
@@ -55,6 +112,7 @@ - (instancetype)initWithStream:(NSInputStream *)s {
 - (void)dealloc {
     self.string = nil;
     self.stream = nil;
+    free(self.buffer);
     [super dealloc];
 }
 
@@ -87,31 +145,163 @@ - (void)setStream:(NSInputStream *)s {
         _length = NSNotFound;
     }
     // reset cursor
+    free(self.buffer);
+    self.buffer = malloc(sizeof(PKUniChar) * self.bufsize);
     self.offset = 0;
+    self.bufOffset = 0;
+    self.length = 0;
 }
 
 
 - (PKUniChar)read {
-    PKUniChar result = PKEOF;
-
     if (_string) {
-        if (_length && _offset < _length) {
-            result = [_string characterAtIndex:self.offset++];
+        return [self readFromString];
+    }
+    else {
+        self.offset++;
+        if (self.length > 0) {
+            return [self popPKUniCharFromBuffer];
         }
-    } else {
-        NSUInteger maxLen = 1; // 2 for wide char?
-        uint8_t c;
-        if ([_stream read:&c maxLength:maxLen]) {
-            result = (PKUniChar)c;
+        else if (self.isStreamInUTF8) {
+            return [self readFromStreamInUTF8];
+        }
+        else {
+            return [self readFromStreamAsBytes];
         }
     }
-
+}
+
+
+-(PKUniChar)readFromString {
+    if (_length && _offset < _length) {
+        return [_string characterAtIndex:self.offset++];
+    }
+    else {
+        return PKEOF;
+    }
+}
+
+
+-(PKUniChar)readFromStreamInUTF8 {
+    UTF32Char ch32 = [self readUTF32Char];
+    if (ch32 == (UTF32Char)-1) {
+        return PKEOF;
+    }
+
+    unichar unichars[2];
+    BOOL isPair = CFStringGetSurrogatePairForLongCharacter(ch32, unichars);
+    if (isPair) {
+        // ch32 is represented by two unichars (two UTF-16 code points).
+        // Return the first, and put the second in the buffer and unread it so that
+        // it will be returned next time.
+        [self addPKUniCharToBuffer:unichars[0]];
+        [self addPKUniCharToBuffer:unichars[1]];
+        [self unread];
+    }
+    else {
+        [self addPKUniCharToBuffer:unichars[0]];
+    }
+    return unichars[0];
+}
+
+
+-(PKUniChar)readFromStreamAsBytes {
+    PKUniChar result = [self readByte];
+    if (result != PKEOF) {
+        [self addPKUniCharToBuffer:result];
+    }
+    return result;
+}
+
+
+-(PKUniChar)readByte {
+    uint8_t c;
+    if ([self.stream read:&c maxLength:1]) {
+        return (PKUniChar)c;
+    }
+    else {
+        return PKEOF;
+    }
+}
+
+
+-(UTF32Char)readUTF32Char {
+    uint8_t bytes[4];
+
+    NSInteger read = [self.stream read:bytes maxLength:1];
+    if (read <= 0) {
+        return (UTF32Char)-1;
+    }
+    size_t seqlen = UTF8SequenceLength(bytes[0]);
+    size_t byteCount = 1;
+
+#define LOGGABLE_BYTE(__i) \
+    (__i < byteCount ? (unsigned)bytes[__i] : UINT_MAX)
+#define LOGGABLE_BYTES \
+    LOGGABLE_BYTE(0), LOGGABLE_BYTE(1), LOGGABLE_BYTE(2), LOGGABLE_BYTE(3)
+
+    while (byteCount < seqlen) {
+        NSInteger read = [self.stream read:(bytes + byteCount) maxLength:1];
+        if (read <= 0) {
+            NSLog(@"Invalid UTF-8 sequence %x%x%x%x followed by EOF", LOGGABLE_BYTES);
+            return (UTF32Char)-1;
+        }
+        byteCount++;
+    }
+    if (isLegalUTF8(bytes, seqlen)) {
+        return readUTF8Sequence(bytes, seqlen);
+    }
+    else {
+        NSLog(@"Invalid UTF-8 sequence %x%x%x%x.", LOGGABLE_BYTES);
+        return (UTF32Char)-1;
+    }
+
+#undef LOGGABLE_BYTE
+#undef LOGGABLE_BYTES
+}
+
+
+-(void)addPKUniCharToBuffer:(PKUniChar)ch {
+    assert(ch != PKEOF);
+    assert(self.length == 0);
+
+    self.buffer[self.bufOffset] = ch;
+    self.bufOffset++;
+    if (self.bufOffset >= self.bufsize) {
+        self.bufOffset = 0;
+    }
+}
+
+
+-(PKUniChar)popPKUniCharFromBuffer {
+    assert(self.length > 0);
+
+    PKUniChar result = self.buffer[self.bufOffset];
+    self.bufOffset++;
+    if (self.bufOffset >= self.bufsize) {
+        self.bufOffset = 0;
+    }
+    self.length--;
     return result;
 }
 
 
 - (void)unread {
-    self.offset = (0 == _offset) ? 0 : _offset - 1;
+    if (self.stream) {
+        assert(self.offset > 0);
+        self.offset--;
+
+        if (self.bufOffset == 0) {
+            self.bufOffset = self.bufsize - 1;
+        }
+        else {
+            self.bufOffset--;
+        }
+        self.length++;
+    }
+    else {
+        self.offset = (0 == _offset) ? 0 : _offset - 1;
+    }
 }
 
 
@@ -121,4 +311,67 @@ - (void)unread:(NSUInteger)count {
     }
 }
 
+
+static int UTF8SequenceLength(uint8_t b0) {
+    if ((b0 & 0x80) == 0)
+        return 1;
+    if ((b0 & 0xC0) != 0xC0)
+        return 0;
+    if ((b0 & 0xE0) == 0xC0)
+        return 2;
+    if ((b0 & 0xF0) == 0xE0)
+        return 3;
+    if ((b0 & 0xF8) == 0xF0)
+        return 4;
+    return 0;
+}
+
+
+// This must be called with the length pre-determined by the first byte (i.e. by UTF8SequenceLength).
+// If presented with a length > 4, this returns false.  The Unicode
+// definition of UTF-8 goes up to 4-byte sequences.
+static bool isLegalUTF8(const uint8_t * source, size_t length) {
+    uint8_t a;
+    const uint8_t * srcptr = source + length;
+    switch (length) {
+        default: return false;
+            // Everything else falls through when "true"...
+        case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+        case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+        case 2: if ((a = (*--srcptr)) > 0xBF) return false;
+
+            switch (*source) {
+                    // no fall-through in this inner switch
+                case 0xE0: if (a < 0xA0) return false; break;
+                case 0xED: if (a > 0x9F) return false; break;
+                case 0xF0: if (a < 0x90) return false; break;
+                case 0xF4: if (a > 0x8F) return false; break;
+                default:   if (a < 0x80) return false;
+            }
+
+        case 1: if (*source >= 0x80 && *source < 0xC2) return false;
+    }
+    if (*source > 0xF4)
+        return false;
+    return true;
+}
+
+
+static const UTF32Char offsetsFromUTF8[4] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL };
+
+static UTF32Char readUTF8Sequence(const uint8_t * sequence, size_t length) {
+    UTF32Char character = 0;
+
+    // The cases all fall through.
+    switch (length) {
+        case 4: character += *sequence++; character <<= 6;
+        case 3: character += *sequence++; character <<= 6;
+        case 2: character += *sequence++; character <<= 6;
+        case 1: character += *sequence++;
+    }
+
+    return character - offsetsFromUTF8[length - 1];
+}
+
+
 @end
diff --git a/src/PKTokenizer.m b/src/PKTokenizer.m
@@ -274,6 +274,16 @@ - (void)setStream:(NSInputStream *)s {
 }
 
 
+-(void)setIsStreamInUTF8:(BOOL)isStreamInUTF8 {
+    self.reader.isStreamInUTF8 = isStreamInUTF8;
+}
+
+
+-(BOOL)isStreamInUTF8 {
+    return self.reader.isStreamInUTF8;
+}
+
+
 #pragma mark -
 
 - (PKTokenizerState *)tokenizerStateFor:(PKUniChar)c {