Complete rule set for direct implementation. All regex in PCRE / Python re format. Apply re.IGNORECASE everywhere.
import re


def causer_stem(spoken: str) -> str:
    """Derive the Causer stem from a spoken form.

    Returns '' when the form contains any of ``! ? ( )``.  Otherwise one
    leading hyphen is removed and the text is lower-cased.  If at least two
    hyphens remain, the first ``-infix-`` segment and everything after it is
    replaced by the infix text alone; otherwise everything from the first
    comma or hyphen onward is dropped.
    """
    if re.search(r'[!?()]', spoken):
        return ''
    lowered = re.sub(r'^-', '', spoken).lower()
    # Choose the rewrite rule up front, then apply it in one place.
    if lowered.count('-') >= 2:
        rule, replacement = r'-([^-]+)-.*$', r'\1'
    else:
        rule, replacement = r'[,-].*$', ''
    return re.sub(rule, replacement, lowered)


def median_stem(spoken: str) -> str:
    """Derive the Median stem from a spoken form.

    Returns '' when the form contains any of ``! ? ( )``.  Otherwise one
    leading hyphen is removed and the text is lower-cased, then:

    * with two or more hyphens, the text before the second-to-last hyphen is
      joined to the text after the last hyphen (the groups are greedy);
    * with a single hyphen, the text before it is joined to the run of
      non-space characters that follows it;
    * with no hyphen, everything from the first comma onward is dropped.
    """
    if re.search(r'[!?()]', spoken):
        return ''
    lowered = re.sub(r'^-', '', spoken).lower()

    infixed = re.match(r'(.*)-(.*)-(.*)', lowered)
    if infixed is not None:
        head, _infix, tail = infixed.groups()
        return head + tail

    suffixed = re.match(r'(.*)-([^ ]*)(.*)', lowered)
    if suffixed is not None:
        head, tail, _rest = suffixed.groups()
        return head + tail

    return re.sub(r',.*$', '', lowered)
The Reduplicated column uses its own 16-pattern table (REDUP_PAIRS), which is completely independent of the Actor/Passor PATTERNS list. Apply the table to the Causer stem; the first matching pattern wins.
# (pattern, replacement) pairs for the Reduplicated column.  Every pattern is
# anchored at the start of the stem; apply_reduplicated() uses the first pair
# whose pattern matches.
REDUP_PAIRS = [
    (r'^[td]h?([sz]h?)([aeiouy]*)([aeiou])', r't\3d\1\2\3'),
    (r'^([bdgptkszfv])(h?)([lrsfzv])', r'\1e\1\2\3'),
    (r'^([sz])(h?)([pbkgtd])', r's\2ez\3'),
    (r'^[fv]([pbkgtd])', r'fev\1'),
    (r'^[sz]([aeiou]*)([aeiou])', r's\2z\1\2'),
    (r'^[fv]([aeiou]*)([aeiou])', r'f\2v\1\2'),
    (r'^[sz]h([aeiou]*)([aeiou])', r'sh\2zh\1\2'),
    (r'^(h?)([uwo]*)([ou])', r'\1owo'),
    (r'^(h?)([iy]*)([aeoiu])', r'\1iya'),
    (r'^[pb]h?([aeiou]*)([aeiou])', r'p\2b\1\2'),
    (r'^([nml])([aeiou]*)([aeiou])', r'\1\3\1\2\3'),
    (r'^[kg]h?([aeiou]*)([aeiou])', r'k\2g\1\2'),
    (r'^[fv]([nml])', r'fav\1'),
    (r'^[sz]([nml])', r'saz\1'),
    # NOTE(review): both groups here are optional, so this pattern matches
    # every string and the catch-all below is never reached.  Confirm whether
    # `[ea]*` was meant to require at least one vowel.
    (r'^(h?)([ea]*)', r'\1ea'),
    (r'^', r'hee'),  # catch-all
]


def apply_reduplicated(causer: str) -> str:
    """Return the reduplicated form of a Causer stem.

    The pairs in REDUP_PAIRS are tried in order and the first pattern that
    matches (case-insensitively) is substituted once.

    Args:
        causer: the Causer stem to reduplicate.

    Returns:
        The stem with the first matching rule applied, or the stem unchanged
        if no pattern matches.
    """
    for pat, repl in REDUP_PAIRS:
        if re.search(pat, causer, flags=re.IGNORECASE):
            # The fourth positional parameter of re.sub is `count`, not
            # `flags`, so the flag must be passed by keyword; otherwise the
            # substitution is case-sensitive and can disagree with the
            # search above.
            return re.sub(pat, repl, causer, count=1, flags=re.IGNORECASE)
    return causer
# Stem-ending patterns shared by the Actor/Passor replacement tables.
# The index of each pattern lines up with the same index in every REPL_* list;
# the letter after the index is the rule's label.
PATTERNS = [
    r'et(t?)$',                                                  # [0]  B
    r'([ou])y$',                                                 # [1]  C
    r"(y[ou]+)([sf]h?)?$",                                       # [2]  D
    r'([w]+)[ae]+$',                                             # [3]  E
    r'([yi]+)([aeiou]+)([bdgptkmnlrfscvzjh]+)$',                 # [4]  F
    r'([wuo]+)([aeiou]*)([bdgptkmnrfscvzjh]+)$',                 # [5]  G
    r'([bdgptkmnlrfscvzjh]+)(([aeiou])([aeiou]))([lr])$',        # [6]  H
    r'([bdgptkmnlrfscvzjh]+)([aeiou])([lr])$',                   # [7]  I
    r"([bdgptkmnlrfscvzjh']+)([ea])([ea]*)$",                    # [8]  J
    r'([bdgptkmnlrfscvzjh]+)([aeiou]*)([ouw]+)$',                # [9]  K
    r'([bdgptkmnlrfscvzjhw]+)([aeiou]*)([iy])$',                 # [10] L
    r'([bdgptkmnlrfscvzjh]+)([aeiou]+)([bdgptkmnrfscvzjh]+)$',   # [11] M
    r'()([ae])y()$',                                             # [12] N
    r'(y)([ae]+)()$',                                            # [13] O
    r"(^|[''\-])([aeoiu]+)([bpkgtd]+)$",                         # [14] P
    r"([''\-])([aeoiu]+)([lr])$",                                # [15] Q
    r"([''\-])([aeoiu]+)([lr][lr])$",                            # [16] R
    r'$',                                                        # [17] S catch-all
]

# Every index except [16] (rule R).
SKIP_R_INDICES = [index for index in range(18) if index != 16]

# Every index, rule R included -- Actor-There only.
ALL_INDICES = [*range(18)]
# Replacement templates, one list per output column.  Entry i of each list is
# the replacement for PATTERNS[i]; rows below are laid out six entries at a
# time ([0]-[5], [6]-[11], [12]-[17]).

# Actor-There (uses ALL_INDICES)
REPL_ACTOR_THERE = [
    r'et\1a', r'\1ya', r"\1'a\2", r'wawa', r'\1\2\3e', r'\1\2\3e',
    r'\1\2ra', r'\1\2ra', r'\1\2wa', r'\1\2wa', r'\1\2\3ya', r'\1\2\3e',
    r'\2ye', r"\1\2\3'a", r'\1\2\3a', r'\1\2ra', r'\1\2\3a', r'a',
]

# Actor-Hither (SKIP_R_INDICES)
REPL_ACTOR_HITHER = [
    r'et\1si', r'\1yi', r"\1'i", r'\1ey', r'\1\2\3i', r'\1\2\3i',
    r'\1\2li', r'\1\2li', r'\1\2yi', r'\1\2wii', r'\1\2\3yi', r'\1\2\3i',
    r'\2yi', r"\1\2\3'i", r'\1\2\3i', r'\1\2ri', r'\1\2\3i', r'i',
]

# Actor-Hence (SKIP_R_INDICES)
REPL_ACTOR_HENCE = [
    r'et\1soy', r'\1iyo', r'\1yo\2', r'\1oy', r'\1\2\3oy', r'\1\2\3oy',
    r'\1\2loy', r'\1\2loy', r'\1\2yo', r'\1\2\3yo', r'\1\2\3yo', r'\1\2\3oy',
    r'\2iyo', r"\1\2\3'oy", r'\1\2\3oy', r'\1\2roy', r'\1\2\3oy', r'oy',
]

# Passor-Here (SKIP_R_INDICES)
REPL_PASSOR_HERE = [
    r'et\1', r'uy', r'yu\2', r'wee', r'ii\3', r'u\3',
    r'\1\4\5', r'\1i\3', r'\1ee', r'\1u', r'\1\2i', r'\1i\3',
    r'ey', r'yee', r'\1e\3', r'\1el', r'\1el', r'',
]

# Passor-There (SKIP_R_INDICES)
REPL_PASSOR_THERE = [
    r'ayt', r'aw', r'yaw\2', r'wea', r'ya\3', r'waw\3',
    r'\1\4ra', r'\1ea\3', r'\1ewa', r'\1ua', r'\1\2ay', r'\1ea\3',
    r'ay', r'yaw', r'\1ay\3', r'\1ear', r'\1ear', r'e',
]

# Passor-Hither (SKIP_R_INDICES)
REPL_PASSOR_HITHER = [
    r'iss', r'uiii', r'iyu\2', r'wi', r'yi\3i', r'wii\3',
    r'\1\4lii', r'\1elii', r'\1ey', r'\1uwi', r'\1eye', r'\1i\3i',
    r'eyi', r'yei', r'\1i\3i', r'\1iri', r'\1iri', r'i',
]

# Passor-Hence (SKIP_R_INDICES)
REPL_PASSOR_HENCE = [
    r'oss', r'uyu', r'uyu\2', r'wu', r'ya\3u', r'wi\3o',
    r'\1\5aw', r'\1\3aw', r'\1oy', r'\1oyo', r'\1yu', r'\1o\3u',
    r'eyu', r'yu', r'\1o\3u', r'\1oru', r'\1oru', r'u',
]
def apply_with_ai_at_7(q, ai, patterns, replacements, indices, flags=re.IGNORECASE):
    """Apply the first rule whose pattern matches ``q``.

    Rules are tried in the order given by ``indices``; falsy pattern entries
    are skipped.  Matching is always tested against ``q``, but when the
    winning rule is index 7 the substitution is performed on ``ai`` instead.
    Returns ``q`` unchanged if no rule matches.
    """
    for position in indices:
        source = patterns[position]
        if not source:
            continue
        regex = re.compile(source, flags)
        if regex.search(q) is None:
            continue
        subject = ai if position == 7 else q
        return regex.sub(replacements[position], subject)
    return q


def apply_passor_hence(q, ai, patterns, replacements, indices, flags=re.IGNORECASE):
    """Apply the first matching rule, testing rule 7 against ``ai``.

    Rules are tried in the order given by ``indices``; falsy pattern entries
    are skipped.  Index 7 is both tested and substituted on ``ai``; every
    other index is tested and substituted on ``q``.  Returns ``q`` unchanged
    if no rule matches.
    """
    for position in indices:
        source = patterns[position]
        if not source:
            continue
        subject = ai if position == 7 else q
        regex = re.compile(source, flags)
        if regex.search(subject) is not None:
            return regex.sub(replacements[position], subject)
    return q


def apply_case_ai(ai, patterns, replacements, indices, flags=re.IGNORECASE):
    """Apply the first rule whose pattern matches ``ai`` and return the result.

    Rules are tried in the order given by ``indices``; falsy pattern entries
    are skipped.  Returns ``ai`` unchanged if no rule matches.
    """
    for position in indices:
        source = patterns[position]
        if not source:
            continue
        regex = re.compile(source, flags)
        if regex.search(ai) is not None:
            return regex.sub(replacements[position], ai)
    return ai


def apply_person(form, pairs, flags=re.IGNORECASE):
    """Apply the first (pattern, replacement) pair that matches ``form``.

    Only one substitution is made, so a pattern that can also match the empty
    string at the end of the form does not fire twice.  Returns ``form``
    unchanged if no pair matches.
    """
    for source, replacement in pairs:
        regex = re.compile(source, flags)
        if regex.search(form) is not None:
            return regex.sub(replacement, form, count=1)
    return form
# Person-suffix rule tables.  Each entry is a (pattern, replacement) pair and
# each table ends with a pattern that matches any form.
# NOTE(review): presumably consumed by apply_person (first match wins) --
# confirm against the caller.

ME_PAIRS = [
    ('([ao]|[ae]e)$', r'\1ni'),
    ('ii$', 'iin'),
    ('[uw]$', 'win'),
    ('e?$', 'in'),
]

YOU_PAIRS = [
    ('([^aeiouwyn])$', r'\1ets'),
    ('([^aeiou]*[aeiou]+[^aeiou]+[aeiou]+[^aeiou]*[aeiou]+)$', r'\1ts'),
    ('$', 'tse'),
]

THEM_PAIRS = [
    ('([aeou])$', r'\1rh'),
    ('([^i][wyi])$', r'\1irh'),
    ('$', 'erh'),
]
# (person, direction) -> (pattern, substitution, fallback).
# The pattern is looked for in the "here" person form; if it is found it is
# replaced by the substitution, otherwise the fallback is appended.
PORTMANTEAUS = {
    ('me', 'there'): ('in$', 'inia', 'ye'),
    ('you', 'there'): ('tse$', 'tsa', 'a'),
    ('them', 'there'): ('e?[ei]rh$', 'earh', 'a'),

    ('me', 'hither'): ('in$', 'inneye', 'yi'),
    ('you', 'hither'): ('tse$', 'tsi', 'i'),
    ('them', 'hither'): ('[ei]rh$', 'eyerh', 'i'),

    ('me', 'hence'): ('in$', 'inyo', 'yo'),
    ('you', 'hence'): ('tse$', 'tsoy', 'oy'),
    ('them', 'hence'): ('[ei]rh$', 'iyorh', 'yo'),
}


def apply_portmanteau(here_person_form, person, direction, flags=re.IGNORECASE):
    """Fuse a person ending with a direction ending.

    Looks up the rule for ``(person, direction)`` in PORTMANTEAUS.  If the
    rule's pattern occurs in ``here_person_form`` it is replaced by the
    rule's substitution; otherwise the rule's fallback suffix is appended.

    Raises:
        KeyError: if ``(person, direction)`` has no entry in PORTMANTEAUS.
    """
    ending, fused, suffix = PORTMANTEAUS[(person, direction)]
    # subn reports how many replacements were made, so a single pass both
    # detects the match and performs the substitution.
    rewritten, hits = re.subn(ending, fused, here_person_form, flags=flags)
    if hits:
        return rewritten
    return here_person_form + suffix
# Suffix rule tables.  Each entry is a (pattern, replacement) pair.
# NOTE(review): presumably applied first-match-wins with a single
# substitution, like the person tables -- confirm against the caller.

CAUSATIVE_ME_PAIRS = [
    # The nasal run must be non-empty.  With `[mn]*` this pattern matched the
    # empty string at the end of every form, so the two rules below could
    # never fire (e.g. "kat" became "katniya" instead of "katiniya").
    # `+` also matches the `(m+|ng)$` rule in CAUSATIVE_YOU_PAIRS.
    ("([mn]+|ng)$", 'niya'),
    ("([aeiouwy])$", r'\1niya'),
    ("$", 'iniya'),
]

CAUSATIVE_YOU_PAIRS = [
    ("(m+|ng)$", 'ntaya'),
    ("([dt])+$", 'ttsaya'),
    ("([aeiourlhzsyw])$", r'\1taya'),
    ("([pbkg])$", r'\1saya'),
    ("$", 'etaya'),
]

CAUSATIVE_THEM_PAIRS = [
    ("([aeiouwyrh])r?$", r'\1rheya'),
    ("$", 'erheya'),
]

PRESENT_ACTIVE_PAIRS = [
    ("oo$", 'waam'),
    ("[eoa]+$", 'aam'),
    ("([iu])$", r'\1yaam'),
    ("$", 'aam'),
]

CHEERS_PAIRS = [
    ("([aeou])[iy]$", r'\1iyets!'),
    ("([^aeiou])$", r'\1eyets!'),
    ("[ou]+$", 'oyets!'),
    ("[aei]*$", 'eyets!'),
    ("$", 'eyets!'),
]
# Start-of-word patterns for the IMP_REPLS tables; entry i of every
# IMP_REPLS list is the replacement for IMP_PATTERNS[i].
# NOTE(review): entries [2] and [4] are identical, and so are their
# replacements in every IMP_REPLS row; if the table is applied
# first-match-wins, [4] can never fire -- confirm whether a different
# pattern was intended.
IMP_PATTERNS = [
    r'^(h?([aeiouyw])|[pbvf][pb]?h?)',          # [0]
    r'^[sz](h?)([aeiou])',                      # [1]
    r'^h?[aeiou]?(([sz])|[dt][td]?)h?',         # [2]
    r'^h?[aeiou]?([gk][gk]?h?|[sz]h)',          # [3]
    r'^h?[aeiou]?(([sz])|[dt][td]?)h?',         # [4]
    r'^[m]([aeiou])',                           # [5]
    r'^h?',                                     # [6] catch-all
]

# Replacement templates keyed by form name, seven per row (one per pattern).
IMP_REPLS = {
    'mild': [
        r'ipp\2', r'itts\1\2', r'itt\2', 'ikk', r'itt\2', r'ibb\1', 'ippe',
    ],
    'regular': [
        r'epp\2', r'etts\1\2', r'ett\2', 'ekk', r'ett\2', r'ebb\1', 'eppe',
    ],
    'strong': [
        r'app\2', r'atts\1\2', r'att\2', 'akk', r'att\2', r'abb\1', 'appe',
    ],
    'silly_int': [
        r'ayopp\2', r'ayotts\1\2', r'ayott\2', 'ayokk', r'ayott\2',
        r'ayobb\1', 'ayoppe',
    ],
    'dism_int': [
        r'eumb\2', r'eundz\1\2', r'eund\2', 'eung', r'eund\2', r'eumb\1',
        'eumbe',
    ],
    'mild_int': [
        r'iyepp\2', r'iyetts\1\2', r'iyyett\2', 'iyyekk', r'iyyett\2',
        r'iyyebb\1', 'iyyeppe',
    ],
    'reg_int': [
        r'eyapp\2', r'eyatts\1\2', r'eyyatt\2', 'eyyakk', r'eyyatt\2',
        r'eyyabb\1', 'eyyappe',
    ],
    'most_int': [
        r'ayapp\2', r'ayatts\1\2', r'ayyatt\2', 'ayyakk', r'ayyatt\2',
        r'ayyabb\1', 'ayyappe',
    ],
    'silly_imp': [
        r'opp\2', r'otts\1\2', r'ott\2', 'okk', r'ott\2', r'obb\1', 'oppe',
    ],
    'dism_imp': [
        r'mb\2', r'ndz\1\2', r'nd\2', 'ng', r'nd\2', r'mb\1', 'mbe',
    ],
}