EmoLanguage/decode.py at main · a904guy/EmoLanguage · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
"""Emoji-to-Text Decoder

This module provides context-aware decoding of emoji sequences back to English text.
It includes grammar reconstruction capabilities to restore proper word forms based
on surrounding context (plurals, tenses, comparatives, etc.).

The decoder uses pattern matching and context analysis to intelligently reconstruct
original text while handling complex emoji combinations and contextual modifiers.
"""

# Standard library imports
import re
from typing import Dict, List, Optional, Set, Tuple, Union

# Local imports
from lib.config import (
    CHARACTER_FALLBACK_REVERSE,
    MORPHOLOGICAL_MODIFIERS_REVERSE,
    PLURAL_CONTEXTS,
    PAST_CONTEXTS,
    FUTURE_CONTEXTS,
    IRREGULAR_COMPARATIVES,
    IRREGULAR_SUPERLATIVES,
    IRREGULAR_PLURALS
)
from lib.emoji_mappings import get_cached_mappings
from lib.morphology import (
    apply_morphological_transformations,
    make_plural_simple
)

# Load the lookup tables once at import time so the decoder class and the
# module-level functions below all share the same objects.
# NOTE(review): judging by the names they are bound to, the pair is presumably
# (word -> emoji, emoji -> word) — confirm against lib.emoji_mappings.
word_to_emoji, emoji_to_word = get_cached_mappings()

class ContextualDecoder:
    """Enhanced decoder with context-aware grammar reconstruction.

    This decoder processes emoji sequences and reconstructs proper English text
    by analyzing contextual clues and applying grammatical rules. It handles:
    - Context modifier extraction
    - Grammar form reconstruction (plurals, tenses, comparatives)
    - Multi-emoji sequence parsing
    - Surrounding word context analysis

    Attributes:
        emoji_pattern: Compiled regex for efficient emoji matching
        emoji_keys: Sorted emoji keys for longest-first matching
    """

    def __init__(self) -> None:
        """Precompute the emoji lookup order and the combined matching regex.

        Sets two attributes:
            emoji_keys: Keys of ``emoji_to_word`` ordered longest first.
            emoji_pattern: Compiled alternation over every dictionary emoji,
                character-fallback emoji and morphological-modifier emoji.
        """
        # Longest keys first so that a multi-codepoint emoji sequence is tried
        # before any shorter sequence it happens to start with.
        self.emoji_keys = sorted(emoji_to_word.keys(), key=len, reverse=True)

        # Pool every emoji the decoder can recognise; using a set drops the
        # duplicates shared between the three tables.
        recognised = set(self.emoji_keys)
        recognised.update(CHARACTER_FALLBACK_REVERSE.keys())
        recognised.update(MORPHOLOGICAL_MODIFIERS_REVERSE.keys())

        # A regex alternation takes the first alternative that matches, so the
        # alternatives themselves must also be ordered longest first.
        alternatives = sorted(recognised, key=len, reverse=True)
        self.emoji_pattern = re.compile(
            "|".join(re.escape(emoji) for emoji in alternatives)
        )

    def extract_morphological_modifiers(self, emoji_sequence: str) -> Tuple[str, Dict[str, str]]:
        """Peel trailing morphological modifier emojis off an emoji sequence.

        Modifiers are removed from the end of the sequence one at a time until
        the remainder no longer ends with any known modifier (or is empty).

        Args:
            emoji_sequence: The emoji sequence to analyze

        Returns:
            Tuple of (base_emoji_sequence, transformations_dict) where the dict
            maps each modifier's meaning to the modifier emoji that was found.
            If two stripped modifiers share a meaning, the one stripped later
            (i.e. closer to the base) overwrites the earlier entry.
        """
        # Longest modifiers first so a multi-emoji modifier is preferred over
        # any shorter modifier that is also a suffix of the sequence.
        longest_first = sorted(
            MORPHOLOGICAL_MODIFIERS_REVERSE.items(),
            key=lambda entry: len(entry[0]),
            reverse=True,
        )

        stem = emoji_sequence
        found: Dict[str, str] = {}

        while stem:
            suffix_hit = next(
                (entry for entry in longest_first if stem.endswith(entry[0])),
                None,
            )
            if suffix_hit is None:
                # Nothing left to strip: what remains is the base sequence.
                break

            marker, meaning = suffix_hit
            stem = stem[:-len(marker)]
            found[meaning] = marker

        return stem, found

    def apply_grammar_rules(
        self,
        base_word: str,
        context: Dict[str, bool],
        surrounding_words: List[str]
    ) -> str:
        """Apply grammar rules to reconstruct proper word form.

        Transforms the base word according to grammatical context indicators
        and surrounding word patterns to restore the original inflected form.

        Args:
            base_word: The base form of the word to transform
            context: Dictionary of boolean flags indicating grammatical context
            surrounding_words: List of nearby words for additional context

        Returns:
            The grammatically correct form of the word
        """
        if not base_word:
            return base_word

        word = base_word

        # Apply comparative transformations
        if context['is_comparative']:
            word = self._apply_comparative_form(base_word)

        # Apply superlative transformations
        elif context['is_superlative']:
            word = self._apply_superlative_form(base_word)

        # If no comparative/superlative and we have surrounding context indicating plural
        elif not context['is_comparative'] and not context['is_superlative']:
            # Apply plural transformations based on context
            word = self._apply_plural_context(word, surrounding_words)

        # Apply tense transformations (simplified for now)
        word = self._apply_tense_context(word, context, surrounding_words)

        return word

    def _apply_comparative_form(self, base_word: str) -> str:
        """Return the comparative ("-er") form of an adjective.

        Args:
            base_word: Base word to transform

        Returns:
            The entry from ``IRREGULAR_COMPARATIVES`` when one exists;
            otherwise a trailing 'y' becomes 'ier', a trailing 'e' gains 'r',
            and anything else gains 'er'.
        """
        # Irregular forms take precedence over the suffix rules.
        if base_word in IRREGULAR_COMPARATIVES:
            return IRREGULAR_COMPARATIVES[base_word]

        if base_word.endswith('y'):
            return f"{base_word[:-1]}ier"

        suffix = 'r' if base_word.endswith('e') else 'er'
        return base_word + suffix

    def _apply_superlative_form(self, base_word: str) -> str:
        """Return the superlative ("-est") form of an adjective.

        Args:
            base_word: Base word to transform

        Returns:
            The entry from ``IRREGULAR_SUPERLATIVES`` when one exists;
            otherwise a trailing 'y' becomes 'iest', a trailing 'e' gains
            'st', and anything else gains 'est'.
        """
        # Irregular forms take precedence over the suffix rules.
        if base_word in IRREGULAR_SUPERLATIVES:
            return IRREGULAR_SUPERLATIVES[base_word]

        # Start from the default rule and refine it for 'y' / 'e' endings.
        stem, ending = base_word, 'est'
        if base_word.endswith('y'):
            stem, ending = base_word[:-1], 'iest'
        elif base_word.endswith('e'):
            ending = 'st'

        return stem + ending

    def _apply_plural_context(self, word: str, surrounding_words: List[str]) -> str:
        """Apply plural transformation based on surrounding context.

        Args:
            word: Current word form
            surrounding_words: List of nearby words for context

        Returns:
            Pluralized word if context indicates plural usage
        """
        # Check if plural context is detected from surrounding words
        plural_detected = any(
            ctx.lower() in PLURAL_CONTEXTS
            for ctx in surrounding_words
        )

        if plural_detected and not word.endswith(('s', 'es')):
            return make_plural_simple(word)

        return word

    def _make_plural(self, word: str) -> str:
        """Return the plural of a singular noun.

        Args:
            word: Singular word to pluralize

        Returns:
            The entry from ``IRREGULAR_PLURALS`` when one exists; otherwise
            the result of the first matching suffix rule:
            consonant + 'y' -> 'ies'; 's'/'sh'/'ch'/'x'/'z' -> + 'es';
            'f' or 'fe' -> 'ves'; anything else -> + 's'.
        """
        # Irregular forms take precedence over the suffix rules.
        if word in IRREGULAR_PLURALS:
            return IRREGULAR_PLURALS[word]

        # Only a 'y' preceded by a consonant turns into 'ies' (e.g. not "day").
        ends_in_consonant_y = (
            len(word) > 1 and word.endswith('y') and word[-2] not in 'aeiou'
        )
        if ends_in_consonant_y:
            return f"{word[:-1]}ies"

        if word.endswith(('s', 'sh', 'ch', 'x', 'z')):
            return f"{word}es"

        # 'f' and 'fe' endings are both replaced by 'ves'.
        for ending in ('f', 'fe'):
            if word.endswith(ending):
                return word[:-len(ending)] + 'ves'

        return f"{word}s"

    def _apply_tense_context(
        self,
        word: str,
        context: Dict[str, bool],
        surrounding_words: List[str]
    ) -> str:
        """Apply tense transformation based on context (simplified implementation).

        Args:
            word: Current word form
            context: Grammatical context flags
            surrounding_words: List of nearby words for context

        Returns:
            Word with appropriate tense (currently returns unchanged)

        Note:
            This is a simplified implementation. Full tense reconstruction
            would require more sophisticated morphological analysis.
        """
        # Detect past tense indicators
        past_detected = (
            context['is_past'] or
            any(ctx.lower() in PAST_CONTEXTS for ctx in surrounding_words)
        )

        # Future tense detection
        future_detected = (
            context['is_future'] or
            any(ctx.lower() in FUTURE_CONTEXTS for ctx in surrounding_words)
        )

        # TODO: Implement proper tense reconstruction
        # This would require a more sophisticated approach with verb conjugation
        # rules and irregular verb handling

        return word

    def decode_with_context(self, text: str) -> str:
        """Decode emoji text with morphological reconstruction.

        Processes emoji sequences in the input text and reconstructs proper
        English text by applying stored morphological transformations.

        Args:
            text: Input text containing emoji sequences to decode

        Returns:
            Decoded English text with proper morphological forms

        Note:
            This method preserves spacing and handles multi-emoji sequences
            while reconstructing original word forms from base + modifier.
        """
        if not text:
            return text

        # Tokenize input while preserving whitespace
        tokens = re.findall(r'\s+|[^\s]+', text)
        output = []

        for token in tokens:
            if token.isspace():
                output.append(token)
                continue

            # Process non-whitespace tokens for emoji sequences
            decoded_token = self._decode_token_with_underscore_handling(token)
            output.append(decoded_token)

        return ''.join(output)

    def _decode_token(self, token: str) -> str:
        """Decode a single token that may contain emoji sequences.

        Args:
            token: Token to decode (no whitespace)

        Returns:
            Decoded token with reconstructed words
        """
        if not token:
            return token

        # Check if this token is primarily character fallback sequence
        if self._is_character_fallback_sequence(token):
            return self._decode_character_fallback_token(token)

        decoded_parts = []
        remaining = token

        # Parse token for emoji sequences, prioritizing base+modifier combinations
        while remaining:
            # Try to find the longest emoji sequence that starts here
            best_match = None
            best_length = 0
            best_match_info = None

            # First check for character emoji sequences (with potential modifiers)
            char_emoji = self._find_character_emoji(remaining)
            if char_emoji:
                # Found a character emoji, check for modifiers after it
                potential_sequence = char_emoji
                after_char = remaining[len(char_emoji):]

                # Look for morphological modifiers immediately after
                for modifier in sorted(MORPHOLOGICAL_MODIFIERS_REVERSE.keys(), key=len, reverse=True):
                    if after_char.startswith(modifier):
                        potential_sequence = char_emoji + modifier
                        break

                # Character emoji sequences get priority
                if len(potential_sequence) > best_length:
                    best_match = potential_sequence
                    best_length = len(potential_sequence)

            # Then check dictionary emoji sequences
            for emoji_key in self.emoji_keys:
                if remaining.startswith(emoji_key):
                    # Check if this emoji could be a base for a modifier
                    potential_sequence = emoji_key
                    after_base = remaining[len(emoji_key):]
                    found_modifiers = []

                    # Look for multiple morphological modifiers immediately after
                    # Keep looking for modifiers until we can't find any more
                    remaining_after_base = after_base
                    while remaining_after_base:
                        found_modifier = False
                        # Try to find a modifier at the start of the remaining text
                        for modifier in sorted(MORPHOLOGICAL_MODIFIERS_REVERSE.keys(), key=len, reverse=True):
                            if remaining_after_base.startswith(modifier):
                                potential_sequence += modifier
                                found_modifiers.append(modifier)
                                remaining_after_base = remaining_after_base[len(modifier):]
                                found_modifier = True
                                break
                        if not found_modifier:
                            break

                    # Only use if longer than character emoji match (character emojis get priority)
                    if len(potential_sequence) > best_length:
                        best_match = potential_sequence
                        best_length = len(potential_sequence)
                        # Store the base emoji and all modifiers found
                        best_match_info = (emoji_key, found_modifiers)

            if best_match:
                # Process the best emoji sequence we found
                emoji_sequence = best_match

                # Use the stored base emoji and modifier info if available
                if best_match_info:
                    base_emoji, found_modifiers = best_match_info
                    if found_modifiers:
                        # Create transformations dict from all modifiers we found
                        transformations = {}
                        for modifier in found_modifiers:
                            modifier_meaning = MORPHOLOGICAL_MODIFIERS_REVERSE[modifier]
                            transformations[modifier_meaning] = modifier
                    else:
                        transformations = {}
                else:
                    # Fallback to the old method
                    base_emoji, transformations = self.extract_morphological_modifiers(emoji_sequence)

                # First, try character fallback decoding (for sequences of character emojis)
                character_decoded = self._decode_character_sequence(emoji_sequence, transformations)
                if character_decoded:
                    decoded_parts.append(character_decoded)
                else:
                    # Look up base word from emoji mapping
                    base_word = emoji_to_word.get(base_emoji)

                    if base_word:
                        # Apply morphological transformations to reconstruct original form
                        reconstructed_word = self._apply_morphological_transformations(
                            base_word, transformations
                        )
                        decoded_parts.append(reconstructed_word)
                    else:
                        # Unknown emoji sequence, preserve it
                        decoded_parts.append(emoji_sequence)

                remaining = remaining[len(emoji_sequence):]
            else:
                # Check for individual character emojis (fallback decoding)
                char_emoji = self._find_character_emoji(remaining)
                if char_emoji:
                    # Decode single character emoji
                    char = CHARACTER_FALLBACK_REVERSE[char_emoji]
                    decoded_parts.append(char)
                    remaining = remaining[len(char_emoji):]
                else:
                    # No emoji match found - preserve the character
                    decoded_parts.append(remaining[0])
                    remaining = remaining[1:]

        # Join all parts together without spaces
        # This preserves punctuation directly attached to words
        return ''.join(decoded_parts)

    def _is_character_fallback_sequence(self, token: str) -> bool:
        """Check if a token is primarily a character fallback sequence.

        Args:
            token: Token to analyze

        Returns:
            True if token appears to be character fallback encoding
        """
        # Check if most of the token consists of character emojis
        total_length = len(token)
        character_emoji_length = 0
        remaining = token

        while remaining:
            char_emoji = self._find_character_emoji(remaining)
            if char_emoji:
                character_emoji_length += len(char_emoji)
                remaining = remaining[len(char_emoji):]
            else:
                # Skip over morphological modifiers
                found_modifier = False
                for modifier in sorted(MORPHOLOGICAL_MODIFIERS_REVERSE.keys(), key=len, reverse=True):
                    if remaining.startswith(modifier):
                        remaining = remaining[len(modifier):]
                        found_modifier = True
                        break
                if not found_modifier:
                    break

        # If 50% or more of the token is character emojis, treat as character sequence
        return character_emoji_length >= total_length * 0.5

    def _decode_character_fallback_token(self, token: str) -> str:
        """Decode a token that's primarily character fallback encoding.

        This method handles character sequences with per-character modifiers correctly,
        avoiding inappropriate word-level morphological transformations.

        Args:
            token: Token to decode (expected to be character fallback)

        Returns:
            Decoded text with proper character handling
        """
        result = []
        remaining = token

        while remaining:
            # Look for character emoji
            char_emoji = self._find_character_emoji(remaining)
            if char_emoji:
                char = CHARACTER_FALLBACK_REVERSE[char_emoji]
                remaining = remaining[len(char_emoji):]

                # Check for capitalization modifier immediately after
                if remaining.startswith('🔠'):
                    char = char.upper()
                    remaining = remaining[len('🔠'):]

                result.append(char)
            else:
                # Check for standalone modifiers (these should be consumed with chars)
                found_modifier = False
                for modifier, meaning in MORPHOLOGICAL_MODIFIERS_REVERSE.items():
                    if remaining.startswith(modifier):
                        # Skip all morphological modifiers that don't apply to character sequences
                        # Capitalization modifiers should only be applied immediately after character emojis
                        remaining = remaining[len(modifier):]
                        found_modifier = True
                        break

                if not found_modifier:
                    # Preserve unknown characters
                    result.append(remaining[0])
                    remaining = remaining[1:]

        return ''.join(result)

    def _find_character_emoji(self, text: str) -> Optional[str]:
        """Return the longest character-fallback emoji that ``text`` starts with.

        Args:
            text: Text to search for character emoji

        Returns:
            The longest key of ``CHARACTER_FALLBACK_REVERSE`` that is a prefix
            of ``text``, or None when no key matches.
        """
        # Two distinct keys of equal length cannot both be a prefix of the
        # same text, so "longest matching key" is unambiguous.
        return max(
            (key for key in CHARACTER_FALLBACK_REVERSE.keys() if text.startswith(key)),
            key=len,
            default=None,
        )

    def _decode_character_sequence(self, emoji_sequence: str, transformations: Dict[str, str]) -> Optional[str]:
        """Decode a sequence that might be character-by-character encoding.

        Args:
            emoji_sequence: Emoji sequence to decode
            transformations: Any morphological transformations found

        Returns:
            Decoded character sequence if it's a character fallback, None otherwise
        """
        # Remove morphological modifiers to get the base emoji sequence
        base_emoji = emoji_sequence
        for modifier in transformations.values():
            if base_emoji.endswith(modifier):
                base_emoji = base_emoji[:-len(modifier)]

        # Try to decode as character-by-character sequence
        decoded_chars = []
        remaining = base_emoji

        while remaining:
            char_emoji = self._find_character_emoji(remaining)
            if char_emoji:
                decoded_chars.append(CHARACTER_FALLBACK_REVERSE[char_emoji])
                remaining = remaining[len(char_emoji):]
            else:
                # If any character can't be decoded, this isn't a character sequence
                return None

        if decoded_chars:
            # Reconstruct the word with transformations
            word = ''.join(decoded_chars)

            # Apply morphological transformations (including capitalization)
            reconstructed_word = self._apply_morphological_transformations(word, transformations)

            return reconstructed_word

        return None

    def _fix_subject_verb_agreement(self, tokens: List[str]) -> List[str]:
        """Fix subject-verb agreement, particularly for plurals.

        Args:
            tokens: List of decoded tokens (words and spaces)

        Returns:
            List of tokens with corrected subject-verb agreement
        """
        result = tokens.copy()

        # Find word tokens (skip spaces)
        word_positions = []
        for i, token in enumerate(tokens):
            if not token.isspace() and token.strip().isalpha():
                word_positions.append((i, token.strip().lower()))

        # Look for plural verb patterns and fix preceding subjects
        for i, (pos, word) in enumerate(word_positions):
            if word in PLURAL_CONTEXTS:  # Found a plural verb like "are"
                # Look for the subject (previous word)
                if i > 0:
                    prev_pos, prev_word = word_positions[i-1]
                    # Pluralize the subject if it's not already plural
                    if not prev_word.endswith(('s', 'es')):
                        pluralized = self._make_plural(prev_word)
                        result[prev_pos] = pluralized

        return result

    def _apply_morphological_transformations(self, base_word: str, transformations: Dict[str, str]) -> str:
        """Rebuild an inflected word from its base form and decoded modifiers.

        Thin instance-level wrapper: the arguments are passed unchanged to
        ``apply_morphological_transformations`` imported from ``lib.morphology``.

        Args:
            base_word: Base/normalized word form
            transformations: Mapping of transformation meaning to the modifier
                emoji that signalled it, as built by the token decoders in
                this class

        Returns:
            Whatever the library function returns for these arguments.
            NOTE(review): presumably the inflected word - confirm in lib.morphology.
        """
        return apply_morphological_transformations(base_word, transformations)

    def _decode_token_with_underscore_handling(self, token: str) -> str:
        """Decode a token that may contain underscore-separated emoji sequences.

        This method handles cases like 'Ⓚ🔠ⓊⓁⓉ_🎭🤹‍♂️🔠' by:
        1. Checking if the token contains underscores
        2. Splitting on underscores and decoding each part separately
        3. Preserving the original structure with underscores

        Args:
            token: Token that may contain underscore-separated emoji sequences

        Returns:
            Decoded token with proper morphological reconstruction
        """
        # Check if token contains underscores
        if '_' not in token:
            return self._decode_token(token)

        # Split on underscores and process each part
        parts = token.split('_')
        decoded_parts = []

        for part in parts:
            # Decode each part separately
            if part:  # Skip empty parts
                decoded_part = self._decode_token(part)
                decoded_parts.append(decoded_part)
            else:
                decoded_parts.append(part)

        # Rejoin with underscores
        return '_'.join(decoded_parts)

    def _make_plural_s(self, word: str) -> str:
        """Add simple 's' plural."""
        return word + 's'

    def _make_plural_es(self, word: str) -> str:
        """Add 'es' plural."""
        return word + 'es'

    def _make_plural_ies(self, word: str) -> str:
        """Convert 'y' to 'ies' plural."""
        if word.endswith('y'):
            return word[:-1] + 'ies'
        return word + 'ies'

    def _make_irregular_plural(self, word: str) -> str:
        """Handle irregular plurals."""
        irregular_plurals = {
            'child': 'children', 'foot': 'feet', 'tooth': 'teeth',
            'goose': 'geese', 'mouse': 'mice', 'man': 'men',
            'woman': 'women', 'person': 'people', 'ox': 'oxen'
        }
        return irregular_plurals.get(word, word + 's')

    def _make_verb_s(self, word: str) -> str:
        """Add 's' for 3rd person singular verb."""
        # Handle irregular verb forms that are already in 3rd person singular
        irregular_verb_forms = {
            'does': 'does',  # does is already 3rd person singular of do
            'goes': 'goes',  # goes is already 3rd person singular of go
            'has': 'has',    # has is already 3rd person singular of have
            'is': 'is',      # is is already 3rd person singular of be
        }

        # If the word is already in irregular 3rd person form, return as-is
        if word in irregular_verb_forms:
            return word

        # Handle irregular verb base forms that need special 3rd person forms
        irregular_verb_bases = {
            'do': 'does',
            'go': 'goes',
            'have': 'has',
            'be': 'is',
        }

        if word in irregular_verb_bases:
            return irregular_verb_bases[word]

        # Apply regular verb_s transformation rules
        if word.endswith(('s', 'sh', 'ch', 'x', 'z')):
            return word + 'es'
        elif word.endswith('y') and len(word) > 1 and word[-2] not in 'aeiou':
            return word[:-1] + 'ies'
        else:
            return word + 's'

    def _make_verb_ed(self, word: str) -> str:
        """Add 'ed' for past tense."""
        if word.endswith('e'):
            return word + 'd'
        elif word.endswith('y') and len(word) > 1 and word[-2] not in 'aeiou':
            return word[:-1] + 'ied'
        else:
            return word + 'ed'

    def _make_verb_ing(self, word: str) -> str:
        """Add 'ing' for progressive form."""
        if word.endswith('e') and len(word) > 2:
            return word[:-1] + 'ing'
        else:
            return word + 'ing'

    def _make_comparative(self, word: str) -> str:
        """Return the comparative form of an adjective.

        Uses ``IRREGULAR_COMPARATIVES`` when the word is listed there;
        otherwise 'y' -> 'ier', trailing 'e' -> + 'r', anything else -> + 'er'.
        """
        # Irregular forms win over the suffix rules.
        if word in IRREGULAR_COMPARATIVES:
            return IRREGULAR_COMPARATIVES[word]

        if word.endswith('y'):
            return ''.join((word[:-1], 'ier'))

        return word + ('r' if word.endswith('e') else 'er')

    def _make_superlative(self, word: str) -> str:
        """Return the superlative form of an adjective.

        Uses ``IRREGULAR_SUPERLATIVES`` when the word is listed there;
        otherwise 'y' -> 'iest', trailing 'e' -> + 'st', anything else -> + 'est'.
        """
        # Irregular forms win over the suffix rules.
        if word in IRREGULAR_SUPERLATIVES:
            return IRREGULAR_SUPERLATIVES[word]

        # (ending to look for, characters to trim from the end, suffix to add)
        suffix_rules = (
            ('y', 1, 'iest'),
            ('e', 0, 'st'),
        )
        for ending, trim, suffix in suffix_rules:
            if word.endswith(ending):
                return word[:len(word) - trim] + suffix

        return word + 'est'

    def _make_adverb_ly(self, word: str) -> str:
        """Make adverb with 'ly' suffix."""
        if word.endswith('y'):
            return word[:-1] + 'ily'
        else:
            return word + 'ly'

# Shared decoder instance used by the module-level functions below.
# It is built at import time, so the emoji regex in ContextualDecoder.__init__
# is compiled once rather than on every decode call.
contextual_decoder = ContextualDecoder()


def decode(text: str) -> str:
    """Decode emoji text back to English using the shared decoder instance.

    This is the main public entry point. It delegates directly to
    ``contextual_decoder.decode_with_context``.

    Args:
        text: Input text containing emoji sequences

    Returns:
        The decoded text; whitespace between tokens is preserved by the
        underlying decoder.

    Example:
        Illustrative only - the actual output depends on the loaded emoji
        mappings and modifier tables::

            decode("🐱 🏃➕ 🏠")  # e.g. "cat runs home"
    """
    return contextual_decoder.decode_with_context(text)


def decode_simple(text: str) -> str:
    """Legacy decoder: plain emoji-to-word substitution, no grammar handling.

    Each non-whitespace token is scanned left to right with the shared
    decoder's emoji regex. A matched emoji is replaced by its entry in
    ``emoji_to_word`` (or kept as-is when it has none) followed by a space;
    unmatched characters are copied through. Leading and trailing spaces of
    each decoded token are stripped, and whitespace between tokens is kept.

    Args:
        text: Input text containing emoji sequences

    Returns:
        Decoded text with basic word substitution only, or ``text`` itself
        when it is empty/falsy.

    Note:
        This function is deprecated. Use decode() for better results.
    """
    if not text:
        return text

    rendered = []
    for token in re.findall(r'\s+|[^\s]+', text):
        if token.isspace():
            rendered.append(token)
            continue

        pattern = contextual_decoder.emoji_pattern
        pieces = []
        cursor = 0
        while cursor < len(token):
            # Anchor the match at the cursor instead of re-slicing the token.
            hit = pattern.match(token, cursor)
            if hit:
                emoji_seq = hit.group(0)
                pieces.append(emoji_to_word.get(emoji_seq, emoji_seq) + " ")
                cursor += len(emoji_seq)
            else:
                # No emoji starts here: keep the character.
                pieces.append(token[cursor])
                cursor += 1

        rendered.append(''.join(pieces).strip())

    return ''.join(rendered)

def main() -> None:
    """Command-line interface for emoji decoding.

    Supports multiple input methods:
    - Command-line arguments: python decode.py "emojis to decode"
    - Piped input: echo "🐱🏃" | python decode.py
    - Interactive mode: python decode.py (prompts for input)

    With arguments or in interactive mode the original and decoded text are
    printed with labels; with piped input only the decoded text is printed so
    the output can be fed to another command.

    Raises:
        SystemExit: With status 1 when decoding fails, so shell pipelines and
            scripts can detect the failure. The error message itself goes to
            stderr.
    """
    import sys

    try:
        if len(sys.argv) > 1:
            # Command-line argument mode
            sample = ' '.join(sys.argv[1:])
            show_labels = True
        elif not sys.stdin.isatty():
            # Piped input mode: emit the bare result only
            sample = sys.stdin.read().strip()
            show_labels = False
        else:
            # Interactive mode
            sample = input('What do you want to decode from Emo? ')
            show_labels = True

        if not sample.strip():
            if show_labels:
                print("No input provided.")
            return

        # Decode using context-aware method
        decoded = decode(sample)

        if show_labels:
            print(f"Original: {sample}")
            print(f"Decoded : {decoded}")
        else:
            print(decoded)

    except KeyboardInterrupt:
        # Stay silent for piped input; only tell an interactive user.
        if sys.stdin.isatty():
            print("\nDecoding interrupted by user.")
    except Exception as e:
        print(f"Error during decoding: {e}", file=sys.stderr)
        # Report the failure through the exit status instead of exiting 0.
        sys.exit(1)

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()