| Both sides previous revisionPrevious revision | |
| yivalkes:regex:pcre [2026/06/13 15:59] – removed - external edit (Unknown date) A User Not Logged in | yivalkes:regex:pcre [2026/06/13 15:59] (current) – ↷ Page moved and renamed from yivalkes:regex to yivalkes:regex:pcre wikarai |
|---|
| | ====== REGEX SPECIFICATION ====== |
| |
| | Complete rule set for direct implementation. All regexes are written in PCRE / Python re syntax. Apply re.IGNORECASE to every search and substitution. |
| | |
| | ===== Stem Functions ===== |
| | |
| | <code python> |
| | import re |
| | |
def causer_stem(spoken: str) -> str:
    """Derive the Causer stem from a spoken form.

    Returns the empty string when the form contains any of ``! ? ( )``.
    Otherwise a single leading hyphen is dropped and the text is lower-cased,
    then:

    * with two or more hyphens, everything from the first ``-segment-`` to the
      end of the string is replaced by that segment alone;
    * with fewer, the text is cut at the first comma or hyphen.
    """
    if re.search(r'[!?()]', spoken) is not None:
        return ''
    stem = re.sub(r'^-', '', spoken).lower()
    if stem.count('-') < 2:
        # Zero or one hyphen: keep only what precedes the first ',' or '-'.
        return re.sub(r'[,-].*$', '', stem)
    # Two or more hyphens: keep the prefix plus the first hyphen-delimited segment.
    return re.sub(r'-([^-]+)-.*$', r'\1', stem)
| | |
def median_stem(spoken: str) -> str:
    """Derive the Median stem from a spoken form.

    Returns the empty string when the form contains any of ``! ? ( )``.
    Otherwise a single leading hyphen is dropped and the text is lower-cased,
    then the first applicable rule is used:

    1. two hyphens: join what precedes the second-to-last hyphen with what
       follows the last one (the segment between them is dropped);
    2. one hyphen: remove the hyphen and keep the text up to the first space
       after it;
    3. no hyphen: cut at the first comma.
    """
    if re.search(r'[!?()]', spoken):
        return ''
    text = re.sub(r'^-', '', spoken).lower()
    # (pattern, groups to concatenate), tried in order.
    hyphen_rules = (
        (r'(.*)-(.*)-(.*)', (1, 3)),
        (r'(.*)-([^ ]*)(.*)', (1, 2)),
    )
    for pattern, kept_groups in hyphen_rules:
        found = re.match(pattern, text)
        if found is not None:
            return ''.join(found.group(*kept_groups))
    return re.sub(r',.*$', '', text)
| | </code> |
| | |
| | ===== Reduplicated: dedicated onset-echo table ===== |
| | |
| | The Reduplicated column uses its own 16-pattern table, which is completely independent of the Actor/Passor PATTERNS list. Apply the table to the Causer stem; the first matching pattern wins. |
| | |
| | <code python> |
# Onset-echo rewrite rules for the Reduplicated column.
# Each entry is (pattern, replacement). Every pattern is anchored at the start
# of the stem, and the rules are tried in order: the first one that matches is
# applied and the rest are ignored.
REDUP_PAIRS = [
    (r'^[td]h?([sz]h?)([aeiouy]*)([aeiou])', r't\3d\1\2\3'),
    (r'^([bdgptkszfv])(h?)([lrsfzv])', r'\1e\1\2\3'),
    (r'^([sz])(h?)([pbkgtd])', r's\2ez\3'),
    (r'^[fv]([pbkgtd])', r'fev\1'),
    (r'^[sz]([aeiou]*)([aeiou])', r's\2z\1\2'),
    (r'^[fv]([aeiou]*)([aeiou])', r'f\2v\1\2'),
    (r'^[sz]h([aeiou]*)([aeiou])', r'sh\2zh\1\2'),
    (r'^(h?)([uwo]*)([ou])', r'\1owo'),
    (r'^(h?)([iy]*)([aeoiu])', r'\1iya'),
    (r'^[pb]h?([aeiou]*)([aeiou])', r'p\2b\1\2'),
    (r'^([nml])([aeiou]*)([aeiou])', r'\1\3\1\2\3'),
    (r'^[kg]h?([aeiou]*)([aeiou])', r'k\2g\1\2'),
    (r'^[fv]([nml])', r'fav\1'),
    (r'^[sz]([nml])', r'saz\1'),
    # Both groups are optional, so this rule matches any stem (including '').
    (r'^(h?)([ea]*)', r'\1ea'),
    (r'^', r'hee'),  # catch-all (never reached: the rule above always matches)
]


def apply_reduplicated(causer: str) -> str:
    """Return the Reduplicated form of a Causer stem.

    Walks ``REDUP_PAIRS`` in order and applies the first rule whose pattern
    matches ``causer`` (case-insensitively). Only the matched onset is
    rewritten; the remainder of the stem is kept as is.

    Args:
        causer: the Causer stem to rewrite.

    Returns:
        The rewritten stem, or ``causer`` unchanged if no rule matches.
    """
    for pat, repl in REDUP_PAIRS:
        if re.search(pat, causer, re.IGNORECASE):
            # flags must be passed by keyword: the 4th positional parameter of
            # re.sub is `count`, so a positional re.IGNORECASE would silently
            # become count=2 with no flags, and upper-case input would pass the
            # search above yet come back unmodified.
            return re.sub(pat, repl, causer, count=1, flags=re.IGNORECASE)
    return causer
| | </code> |
| | |
| | ===== 18 Patterns (Python regex strings) ===== |
| | |
| | <code python> |
# Ordered list of 18 stem-final patterns (indices 0-17). The letter after each
# index (B..S) is the rule label carried over from the specification. Every
# entry ends in `$`, i.e. it is matched against the end of the stem. The
# replacement tables are indexed in parallel with this list, so the order and
# the capture-group numbering of each entry must not change.
PATTERNS = [
    r'et(t?)$',  # [0] B
    r'([ou])y$',  # [1] C
    r"(y[ou]+)([sf]h?)?$",  # [2] D  (group 2 is optional)
    r'([w]+)[ae]+$',  # [3] E
    r'([yi]+)([aeiou]+)([bdgptkmnlrfscvzjh]+)$',  # [4] F
    r'([wuo]+)([aeiou]*)([bdgptkmnrfscvzjh]+)$',  # [5] G
    r'([bdgptkmnlrfscvzjh]+)(([aeiou])([aeiou]))([lr])$',  # [6] H  (5 groups: 2 = vowel pair, 3/4 = its halves)
    r'([bdgptkmnlrfscvzjh]+)([aeiou])([lr])$',  # [7] I
    r"([bdgptkmnlrfscvzjh']+)([ea])([ea]*)$",  # [8] J
    r'([bdgptkmnlrfscvzjh]+)([aeiou]*)([ouw]+)$',  # [9] K
    r'([bdgptkmnlrfscvzjhw]+)([aeiou]*)([iy])$',  # [10] L
    r'([bdgptkmnlrfscvzjh]+)([aeiou]+)([bdgptkmnrfscvzjh]+)$',  # [11] M
    r'()([ae])y()$',  # [12] N  (empty groups 1 and 3 keep the \1\2\3 numbering)
    r'(y)([ae]+)()$',  # [13] O
    # NOTE(review): the character class [''\-] in [14]-[16] lists the same
    # apostrophe twice; possibly a typographic apostrophe was normalised during
    # export - confirm against the original page.
    r"(^|[''\-])([aeoiu]+)([bpkgtd]+)$",  # [14] P
    r"([''\-])([aeoiu]+)([lr])$",  # [15] Q
    r"([''\-])([aeoiu]+)([lr][lr])$",  # [16] R
    r'$',  # [17] S catch-all (matches the end of any stem)
]

# Index lists that select which PATTERNS entries a case tries, in order.
SKIP_R_INDICES = list(range(16)) + [17]  # all except [16]
ALL_INDICES = list(range(18))  # Actor-There only
| | </code> |
| | |
| | ===== Replacement Strings per Case ===== |
| | |
| | <code python> |
# Replacement tables, one per case. Each list has 18 entries; entry i is the
# re.sub replacement template for PATTERNS[i], so \N refers to capture group N
# of that pattern. Rows below are laid out six entries per line:
# indices 0-5, 6-11, 12-17.

# Actor-There (uses ALL_INDICES)
REPL_ACTOR_THERE = [
    r'et\1a', r'\1ya', r"\1'a\2", r'wawa', r'\1\2\3e', r'\1\2\3e',
    r'\1\2ra', r'\1\2ra', r'\1\2wa', r'\1\2wa', r'\1\2\3ya', r'\1\2\3e',
    r'\2ye', r"\1\2\3'a", r'\1\2\3a', r'\1\2ra', r'\1\2\3a', r'a'
]
# Actor-Hither (SKIP_R_INDICES)
REPL_ACTOR_HITHER = [
    r'et\1si', r'\1yi', r"\1'i", r'\1ey', r'\1\2\3i', r'\1\2\3i',
    r'\1\2li', r'\1\2li', r'\1\2yi', r'\1\2wii', r'\1\2\3yi', r'\1\2\3i',
    r'\2yi', r"\1\2\3'i", r'\1\2\3i', r'\1\2ri', r'\1\2\3i', r'i'
]
# Actor-Hence (SKIP_R_INDICES)
REPL_ACTOR_HENCE = [
    r'et\1soy', r'\1iyo', r'\1yo\2', r'\1oy', r'\1\2\3oy', r'\1\2\3oy',
    r'\1\2loy', r'\1\2loy', r'\1\2yo', r'\1\2\3yo', r'\1\2\3yo', r'\1\2\3oy',
    r'\2iyo', r"\1\2\3'oy", r'\1\2\3oy', r'\1\2roy', r'\1\2\3oy', r'oy'
]
# Passor-Here (SKIP_R_INDICES)
REPL_PASSOR_HERE = [
    r'et\1', r'uy', r'yu\2', r'wee', r'ii\3', r'u\3',
    r'\1\4\5', r'\1i\3', r'\1ee', r'\1u', r'\1\2i', r'\1i\3',
    r'ey', r'yee', r'\1e\3', r'\1el', r'\1el', r''
]
# Passor-There (SKIP_R_INDICES)
REPL_PASSOR_THERE = [
    r'ayt', r'aw', r'yaw\2', r'wea', r'ya\3', r'waw\3',
    r'\1\4ra', r'\1ea\3', r'\1ewa', r'\1ua', r'\1\2ay', r'\1ea\3',
    r'ay', r'yaw', r'\1ay\3', r'\1ear', r'\1ear', r'e'
]
# Passor-Hither (SKIP_R_INDICES)
REPL_PASSOR_HITHER = [
    r'iss', r'uiii', r'iyu\2', r'wi', r'yi\3i', r'wii\3',
    r'\1\4lii', r'\1elii', r'\1ey', r'\1uwi', r'\1eye', r'\1i\3i',
    r'eyi', r'yei', r'\1i\3i', r'\1iri', r'\1iri', r'i'
]
# Passor-Hence (SKIP_R_INDICES)
REPL_PASSOR_HENCE = [
    r'oss', r'uyu', r'uyu\2', r'wu', r'ya\3u', r'wi\3o',
    r'\1\5aw', r'\1\3aw', r'\1oy', r'\1oyo', r'\1yu', r'\1o\3u',
    r'eyu', r'yu', r'\1o\3u', r'\1oru', r'\1oru', r'u'
]
| | </code> |
| | |
| | ===== Apply Functions ===== |
| | |
| | <code python> |
def apply_with_ai_at_7(q, ai, patterns, replacements, indices, flags=re.IGNORECASE):
    """Apply the first pattern that matches ``q``, rewriting ``ai`` for index 7.

    The patterns are tried in the order given by ``indices``; matching is
    always tested against ``q``. When the winning index is 7 the substitution
    is performed on ``ai`` instead of ``q`` (if the pattern does not match
    ``ai``, ``ai`` is returned unchanged). For every other index the
    substitution is performed on ``q``.

    Args:
        q: form the patterns are matched against.
        ai: alternative form substituted into when index 7 wins.
        patterns: sequence of regex strings; falsy entries are skipped.
        replacements: sequence of replacement templates, parallel to
            ``patterns``.
        indices: iterable of indices into ``patterns`` / ``replacements``.
        flags: regex flags used for both the search and the substitution.

    Returns:
        The rewritten form, or ``q`` unchanged when no pattern matches.
    """
    for idx in indices:
        pattern = patterns[idx]
        if not pattern:
            continue
        if re.search(pattern, q, flags):
            target = ai if idx == 7 else q
            # count=1: a rule rewrites one suffix. Without it, a pattern that
            # can match empty (e.g. 'e?$') is replaced a second time at the
            # end of the string right after its non-empty match.
            return re.sub(pattern, replacements[idx], target, count=1, flags=flags)
    return q
| | |
def apply_passor_hence(q, ai, patterns, replacements, indices, flags=re.IGNORECASE):
    """Apply the first matching pattern, using ``ai`` as the subject for index 7.

    The patterns are tried in the order given by ``indices``. Index 7 is both
    tested against and substituted into ``ai``; every other index is tested
    against and substituted into ``q``. If index 7 does not match ``ai`` the
    search simply continues with the next index.

    Args:
        q: form used for every index except 7.
        ai: form used for index 7.
        patterns: sequence of regex strings; falsy entries are skipped.
        replacements: sequence of replacement templates, parallel to
            ``patterns``.
        indices: iterable of indices into ``patterns`` / ``replacements``.
        flags: regex flags used for both the search and the substitution.

    Returns:
        The rewritten form, or ``q`` unchanged when no pattern matches.
    """
    for idx in indices:
        pattern = patterns[idx]
        if not pattern:
            continue
        subject = ai if idx == 7 else q
        if re.search(pattern, subject, flags):
            # count=1: a rule rewrites one suffix. Without it, a pattern that
            # can match empty (e.g. 'e?$') is replaced a second time at the
            # end of the string right after its non-empty match.
            return re.sub(pattern, replacements[idx], subject, count=1, flags=flags)
    return q
| | |
def apply_case_ai(ai, patterns, replacements, indices, flags=re.IGNORECASE):
    """Rewrite ``ai`` with the first pattern (in ``indices`` order) that matches it.

    Args:
        ai: form to match and rewrite.
        patterns: sequence of regex strings; falsy entries are skipped.
        replacements: sequence of replacement templates, parallel to
            ``patterns``.
        indices: iterable of indices into ``patterns`` / ``replacements``.
        flags: regex flags used for both the search and the substitution.

    Returns:
        The rewritten form, or ``ai`` unchanged when no pattern matches.
    """
    for idx in indices:
        pattern = patterns[idx]
        if not pattern:
            continue
        if re.search(pattern, ai, flags):
            # count=1: a rule rewrites one suffix. Without it, a pattern that
            # can match empty (e.g. 'e?$') is replaced a second time at the
            # end of the string right after its non-empty match.
            return re.sub(pattern, replacements[idx], ai, count=1, flags=flags)
    return ai
| | |
def apply_person(form, pairs, flags=re.IGNORECASE):
    """Apply the first (pattern, replacement) pair whose pattern matches ``form``.

    Pairs are tried in order and evaluation stops at the first match. Only one
    substitution is made (``count=1``), so a pattern that can also match the
    empty string at the end of ``form`` is not applied twice.

    Args:
        form: the form to rewrite.
        pairs: iterable of ``(pattern, replacement)`` tuples.
        flags: regex flags used for both the search and the substitution.

    Returns:
        The rewritten form, or ``form`` unchanged when no pattern matches.
    """
    # The generator is lazy, so patterns after the first hit are never searched.
    first_hit = next(
        ((pattern, replacement)
         for pattern, replacement in pairs
         if re.search(pattern, form, flags)),
        None,
    )
    if first_hit is None:
        return form
    pattern, replacement = first_hit
    return re.sub(pattern, replacement, form, flags=flags, count=1)
| | </code> |
| | |
| | ===== Person Suffix Pairs ===== |
| | |
| | <code python> |
# Person suffix rules. Each list holds (pattern, replacement) tuples in
# priority order; every pattern is anchored at the end of the form with `$`.
# The last entry of each list matches any form, so it acts as the default.
# Presumably these are the `pairs` argument of apply_person (first match wins)
# - confirm against the caller.
ME_PAIRS = [("([ao]|[ae]e)$", r'\1ni'), ("ii$", 'iin'), ("[uw]$", 'win'), ("e?$", 'in')]
YOU_PAIRS = [("([^aeiouwyn])$", r'\1ets'),
             ("([^aeiou]*[aeiou]+[^aeiou]+[aeiou]+[^aeiou]*[aeiou]+)$", r'\1ts'),
             ("$", 'tse')]
THEM_PAIRS = [("([aeou])$", r'\1rh'), ("([^i][wyi])$", r'\1irh'), ("$", 'erh')]
| | </code> |
| | |
| | ===== Portmanteau Table ===== |
| | |
| | <code python> |
# Portmanteau rules keyed by (person, direction).
# Each value is (suffix pattern, fused replacement, fallback suffix): when the
# pattern matches, the matched suffix is replaced by the fused form; otherwise
# the fallback suffix is appended.
PORTMANTEAUS = {
    # direction: there
    ('me', 'there'): ('in$', 'inia', 'ye'),
    ('you', 'there'): ('tse$', 'tsa', 'a'),
    ('them', 'there'): ('e?[ei]rh$', 'earh', 'a'),
    # direction: hither
    ('me', 'hither'): ('in$', 'inneye', 'yi'),
    ('you', 'hither'): ('tse$', 'tsi', 'i'),
    ('them', 'hither'): ('[ei]rh$', 'eyerh', 'i'),
    # direction: hence
    ('me', 'hence'): ('in$', 'inyo', 'yo'),
    ('you', 'hence'): ('tse$', 'tsoy', 'oy'),
    ('them', 'hence'): ('[ei]rh$', 'iyorh', 'yo'),
}


def apply_portmanteau(here_person_form, person, direction, flags=re.IGNORECASE):
    """Fuse a person-marked form with a direction.

    Looks up the rule for ``(person, direction)`` in ``PORTMANTEAUS``. If the
    rule's suffix pattern matches ``here_person_form`` the suffix is replaced
    by the fused form; otherwise the rule's fallback suffix is appended.

    Args:
        here_person_form: the person-marked form to transform.
        person: ``'me'``, ``'you'`` or ``'them'``.
        direction: ``'there'``, ``'hither'`` or ``'hence'``.
        flags: regex flags used for both the search and the substitution.

    Returns:
        The fused form.

    Raises:
        KeyError: if ``(person, direction)`` is not a key of ``PORTMANTEAUS``.
    """
    suffix_pattern, fused, appended = PORTMANTEAUS[(person, direction)]
    if re.search(suffix_pattern, here_person_form, flags) is None:
        return here_person_form + appended
    return re.sub(suffix_pattern, fused, here_person_form, flags=flags)
| | </code> |
| | |
| | ===== Causative / Present Active / Cheers ===== |
| | |
| | <code python> |
# Causative, Present Active and Cheers suffix rules. Each list holds
# (pattern, replacement) tuples in priority order; every pattern is anchored
# at the end of the form with `$`. Presumably these are applied with
# apply_person (first match wins, one substitution) - confirm against the caller.

# The nasal rule uses `[mn]+` (one or more), matching `(m+|ng)$` in
# CAUSATIVE_YOU_PAIRS. With `[mn]*` the first pattern matched the empty string
# at the end of every form, so the vowel rule and the 'iniya' default below
# could never be reached.
CAUSATIVE_ME_PAIRS = [("([mn]+|ng)$", 'niya'), ("([aeiouwy])$", r'\1niya'), ("$", 'iniya')]
CAUSATIVE_YOU_PAIRS = [("(m+|ng)$", 'ntaya'), ("([dt])+$", 'ttsaya'),
                       ("([aeiourlhzsyw])$", r'\1taya'), ("([pbkg])$", r'\1saya'), ("$", 'etaya')]
CAUSATIVE_THEM_PAIRS = [("([aeiouwyrh])r?$", r'\1rheya'), ("$", 'erheya')]
PRESENT_ACTIVE_PAIRS = [("oo$", 'waam'), ("[eoa]+$", 'aam'), ("([iu])$", r'\1yaam'), ("$", 'aam')]
# In CHEERS_PAIRS the '[aei]*$' entry already matches every form, so the final
# '$' entry is a redundant default.
CHEERS_PAIRS = [("([aeou])[iy]$", r'\1iyets!'), ("([^aeiou])$", r'\1eyets!'),
                ("[ou]+$", 'oyets!'), ("[aei]*$", 'eyets!'), ("$", 'eyets!')]
| | </code> |
| | |
| | ===== Imperative Patterns and Replacements ===== |
| | |
| | <code python> |
# Imperative onset patterns (indices 0-6), all anchored at the start of the
# form. IMP_REPLS maps each imperative variety to a list of seven replacement
# templates, one per IMP_PATTERNS index; \N refers to capture group N of the
# pattern at the same index.
# NOTE(review): entries [2] and [4] are the same pattern (and every IMP_REPLS
# row repeats the same template at both positions). If the patterns are tried
# in order with first match wins, [4] can never be selected - confirm whether
# a different pattern was intended there.
IMP_PATTERNS = [
    r'^(h?([aeiouyw])|[pbvf][pb]?h?)',
    r'^[sz](h?)([aeiou])',
    r'^h?[aeiou]?(([sz])|[dt][td]?)h?',
    r'^h?[aeiou]?([gk][gk]?h?|[sz]h)',
    r'^h?[aeiou]?(([sz])|[dt][td]?)h?',
    r'^[m]([aeiou])',
    r'^h?',  # catch-all
]

IMP_REPLS = {
    'mild': [r'ipp\2', r'itts\1\2', r'itt\2', 'ikk', r'itt\2', r'ibb\1', 'ippe' ],
    'regular': [r'epp\2', r'etts\1\2', r'ett\2', 'ekk', r'ett\2', r'ebb\1', 'eppe' ],
    'strong': [r'app\2', r'atts\1\2', r'att\2', 'akk', r'att\2', r'abb\1', 'appe' ],
    'silly_int': [r'ayopp\2', r'ayotts\1\2',r'ayott\2','ayokk', r'ayott\2',r'ayobb\1','ayoppe' ],
    'dism_int': [r'eumb\2', r'eundz\1\2', r'eund\2', 'eung', r'eund\2', r'eumb\1', 'eumbe' ],
    'mild_int': [r'iyepp\2', r'iyetts\1\2',r'iyyett\2','iyyekk',r'iyyett\2',r'iyyebb\1','iyyeppe'],
    'reg_int': [r'eyapp\2', r'eyatts\1\2',r'eyyatt\2','eyyakk',r'eyyatt\2',r'eyyabb\1','eyyappe'],
    'most_int': [r'ayapp\2', r'ayatts\1\2',r'ayyatt\2','ayyakk',r'ayyatt\2',r'ayyabb\1','ayyappe'],
    'silly_imp': [r'opp\2', r'otts\1\2', r'ott\2', 'okk', r'ott\2', r'obb\1', 'oppe' ],
    'dism_imp': [r'mb\2', r'ndz\1\2', r'nd\2', 'ng', r'nd\2', r'mb\1', 'mbe' ],
}
| | </code> |