Xác định cấu trúc âm tiết tiếng Việt bằng Python

Về mặt lí thuyết thì cấu trúc âm tiết tiếng Việt có thể được miêu tả bằng nhiều cách khác nhau. Nhưng phổ biến nhất vẫn là cấu trúc hai bậc, trong đó bậc 1 gồm âm đầu (onset), vần (rhyme) và thanh điệu (tone), bậc 2 là các thành phần của vần, gồm: âm đệm (glide), âm chính (nucleus) và âm cuối (coda).

Hệ thống chữ quốc ngữ cũng khá tương thích với mô hình này, trừ một số trường hợp ngoại lệ.

Trong bài này, anh dùng Python để xác định cấu trúc âm tiết tiếng Việt thể hiện bằng chữ quốc ngữ. Kết quả trả về sẽ là một list gồm:

âm tiết
thanh điệu (từ 1 đến 6 tương ứng với không, huyền, hỏi, ngã, sắc, nặng)
âm đầu
âm đệm
âm chính
âm cuối
vần

Nếu âm tiết không hợp lệ thì sẽ trả về kết quả False.

import re
import vos
from pprint import pprint

# Every vowel letter of the Vietnamese alphabet, with and without tone marks.
_VOWEL_CLASS = "[aàảãáạăằẳẵắặâầẩẫấậeèẻẽéẹêềểễếệiìỉĩíịoòỏõóọôồổỗốộơờởỡớợuùủũúụưừửữứựyỳỷỹýỵ]"

# Cheap shape check: optional leading letters, 1 to 7 vowel letters,
# optional trailing letters, and nothing else.
_QUICK_CHECK = re.compile(rf"^([b-zđ]+)?{_VOWEL_CLASS}{{1,7}}([c-y]+)?$", re.I)

# Tone number -> the vowel letters that carry that tone mark.
# Tone '1' (unmarked) is the fallback when none of these match.
_TONE_MARKS = {
    '2': re.compile('[àằầèềìòồờùừỳ]'),
    '3': re.compile('[ảẳẩẻểỉỏổởủửỷ]'),
    '4': re.compile('[ãẵẫẽễĩõỗỡũữỹ]'),
    '5': re.compile('[áắấéếíóốớúứý]'),
    '6': re.compile('[ạặậẹệịọộợụựỵ]'),
}

# Spellings accepted in each slot of a tone-stripped syllable.
_ONSETS = '|'.join([
    'b', 'c', 'ch', 'd', 'đ', 'g', 'gh', 'gi', 'h',
    'k', 'kh', 'l', 'm', 'n', 'ng', 'ngh', 'nh', 'p',
    'ph', 'qu', 'r', 's', 't', 'th', 'tr', 'v', 'x',
])
_GLIDES = '[uo]'
_NUCLEI = '[aăâeêioôơuưy]|[iy]ê|ươ|[iyuư]a|[uô]ô|oo'
_CODAS = '|'.join(['c', 'ch', 'i', 'm', 'n', 'ng', 'nh', 'o', 'p', 't', 'u', 'y'])

# The input is already known to consist of letters only, so the two \b
# anchors force the four groups to cover the whole syllable.
_SEGMENTS = re.compile(rf"\b({_ONSETS})?({_GLIDES})?({_NUCLEI})({_CODAS})?\b")


def syllable_parser(syllable, verbose = False):

    """ Parse a Vietnamese syllable and
        return a list of phonemes and the rhyme

    The syllable is stripped, lower-cased and passed through
    ``vos.vos_iy`` (I/Y spelling normalisation), its tone is read from the
    tone-marked vowel, the tone mark is removed with
    ``vos.remove_tonemark`` and the remainder is split into
    onset / glide / nucleus / coda by a regular expression. A few
    orthographic special cases (QU-, GI-, -UA/-UÔ-, -OO-, -UI/-OI) are then
    corrected by hand.

    Parameters
    ----------
    syllable: string, required
        To be parsed syllable
    verbose: boolean, default False
        Show warnings or not

    Returns
    -------
    list
        If the syllable is valid
        [
            syllable,   (normalised spelling, tone mark kept)
            tone,       ('1' to '6', as a string)
            onset,
            glide,
            nucleus,
            coda,
            rhyme       (glide + nucleus + coda)
        ]
    False
        If the syllable is invalid
    """

    # Quick validation
    syllable = syllable.strip()

    if _QUICK_CHECK.search(syllable) is None:
        if verbose:
            print('Not look like a syllable')
        return False

    # Correct I/Y and convert the string to lower case
    syllable = vos.vos_iy(syllable.lower())

    # Exceptional syllables, answered directly
    if syllable == 'gịa':
        return ['gịa', '6', 'gi', '', 'ia', '', 'ia']
    if syllable == 'quốc':
        return ['quốc', '5', 'q', '', 'uô', 'c', 'uôc']

    # Which tone? The first class (in key order 2..6) found in the syllable
    # wins; no tone mark at all means tone 1.
    tone = next(
        (key for key, marked in _TONE_MARKS.items() if marked.search(syllable)),
        '1',
    )

    # Which segmental phonemes? Work on the syllable without its tone mark.
    found = _SEGMENTS.search(vos.remove_tonemark(syllable))

    # Nothing matched: the syllable is invalid
    if found is None:
        return False

    onset = found[1] or ''
    glide = found[2] or ''
    nucleus = found[3]
    coda = found[4] or ''

    # The regex is purely orthographic; the rules below fix the cases where
    # spelling and phonemic structure disagree.

    # Q-: "qu" spells the onset q followed by the glide u
    if onset == 'qu':
        onset = 'q'
        glide = 'u'
        # QUÔ- is rejected (the only accepted case, "quốc", was handled above)
        if nucleus == 'ô':
            return False

    # GI-: the regex tries "g" before "gi", so "gi..." may come back as
    # g + i...  Only re-read it as "gi" when no glide was matched: in
    # "gọi" / "gùi" (g + o/u + i) the onset really is "g".
    if onset == 'g' and glide == '' and nucleus in ('i', 'ia', 'iê'):
        onset = 'gi'
        if nucleus == 'ia':
            # GIA-: the i belongs to the onset
            nucleus = 'a'
        elif nucleus == 'i' and coda in ('o', 'u'):
            # GIO, GIU ("gió", "giũ"): the i belongs to the onset and the
            # letter matched as coda is actually the nucleus
            nucleus = coda
            coda = ''

    # -UA, -UÔ-: u + a/ô is a diphthong nucleus, not glide + vowel
    # (except after q, where the u is always the glide)
    if glide == 'u' and nucleus in ('a', 'ô') and onset != 'q':
        glide = ''
        nucleus = 'u' + nucleus

    # -OO-
    if glide == 'o' and nucleus == 'o':
        glide = ''
        nucleus = 'oo'

    # -UI, -OI: the u/o is the nucleus and the i is the coda. Skipped when a
    # coda was already matched (it would be overwritten and lost) and after
    # q, where the u is the glide set by the Q- rule above.
    if glide != '' and nucleus == 'i' and coda == '' and onset != 'q':
        coda = nucleus
        nucleus = glide
        glide = ''

    # Optional warning: stop codas only combine with tones 5 and 6
    if verbose and coda in ('c', 'ch', 'p', 't') and tone not in ('5', '6'):
        print("Invalid tone mark")

    # The rhyme is a combination of the last three segmental phonemes
    rhyme = glide + nucleus + coda

    # End result
    return [syllable, tone, onset, glide, nucleus, coda, rhyme]

Trong hàm trên, anh dùng hai hàm riêng:

vos.vos_iy() chuẩn hoá cách viết I/Y (xem thêm ở Dự án S)
vos.remove_tonemark() loại bỏ dấu thanh điệu (đã giới thiệu ở Lượm.TV)

Chạy thử:

# Demo run: for each sample print the upper-cased spelling, the parse result
# (warnings enabled) and a separator line.
samples = (
    'gì', 'giêng', 'gịa', 'giạ', 'giáo', 'nghiêng',
    'thuyền', 'ăn', 'oà', 'ủa', 'quốc', 'quộc', 'cuộc',
    'muynh', 'toảt', 'time', 'quyét', 'international', 'tì m',
)
separator = '-' * 7

for syllable in samples:
    print(syllable.upper())
    parsed = syllable_parser(syllable, True)
    pprint(parsed)
    print(separator)

cho ra kết quả:

GÌ
['gì', '2', 'gi', '', 'i', '', 'i']
-------
GIÊNG
['giêng', '1', 'gi', '', 'iê', 'ng', 'iêng']
-------
GỊA
['gịa', '6', 'gi', '', 'ia', '', 'ia']
-------
GIẠ
['giạ', '6', 'gi', '', 'a', '', 'a']
-------
GIÁO
['giáo', '5', 'gi', '', 'a', 'o', 'ao']
-------
NGHIÊNG
['nghiêng', '1', 'ngh', '', 'iê', 'ng', 'iêng']
-------
THUYỀN
['thuyền', '2', 'th', 'u', 'yê', 'n', 'uyên']
-------
ĂN
['ăn', '1', '', '', 'ă', 'n', 'ăn']
-------
OÀ
['oà', '2', '', 'o', 'a', '', 'oa']
-------
ỦA
['ủa', '3', '', '', 'ua', '', 'ua']
-------
QUỐC
['quốc', '5', 'q', '', 'uô', 'c', 'uôc']
-------
QUỘC
False
-------
CUỘC
['cuộc', '6', 'c', '', 'uô', 'c', 'uôc']
-------
MUYNH
['muynh', '1', 'm', 'u', 'y', 'nh', 'uynh']
-------
TOẢT
Invalid tone mark
['toảt', '3', 't', 'o', 'a', 't', 'oat']
-------
TIME
False
-------
QUYÉT
False
-------
INTERNATIONAL
Not look like a syllable
False
-------
TÌ M
Not look like a syllable
False
-------

Cho đến nay thì anh vẫn chưa biết ứng dụng cái này làm gì ngoài việc tìm âm tiết cùng vần.

Xác định cấu trúc âm tiết tiếng Việt bằng Python

Một bình luận