“Regex Spellcheck” (step 2a) 😀
#!/bin/sh
# Vietnamese-initial-sounds splitter
#
# Splits the Vietnamese syllable list (a file named 'LIST' in the current
# directory) into one list per initial sound. Each list is stored in its own
# *.txt file and contains the syllables that begin with that initial sound,
# as spelled in the Quốc ngữ writing system.
#
# NOTICE: The initial sound /?-/ is not included here; syllables that do not
# start with one of the spellings in $sounds are not written to any list.
#
# If you haven't got 'LIST', you can get it from the TuDienVan package:
# http://sf.net/project/showfiles.php?group_id=134457&package_id=151085
#
# Output: the per-sound files are written into ./$tmpdir, and the last line
# of `wc -l` over those files (the total) is printed on stdout.
# Exit status: 0 on success, 1 on any fatal error.
#
# Requires a sed that supports in-place editing via `-i` (e.g. GNU sed).

# All lists will be moved into this directory
tmpdir=c1a

# List of initial sounds (spellings). Multi-letter spellings share a prefix
# with single-letter ones (e.g. 'ch' vs 'c'); the stripping step below removes
# the longer spellings from the shorter spellings' lists.
sounds='ch gh kh ngh ng nh p ph th tr b c d đ g h gi k l m n q r s t v x'

# Print every argument on its own line to stderr and abort with status 1.
die() {
  printf '%s\n' "$@" >&2
  exit 1
}

# Delete from file $1 every line that matches the anchored pattern ^$2.
strip_prefix() {
  sed -i -e "/^$2/d" "$1" || die "Fatal error: cannot strip '$2' from $1"
}

# Checking for Vietnamese syllables list
if [ ! -f LIST ]; then
  die 'Fatal error: LIST not found!' \
    'Please correct this problem before perform any actions!'
fi

# Create target directory if not existed
mkdir -p -- "$tmpdir" || die "Fatal error: cannot create directory $tmpdir"

# Splitting...
# $sounds is intentionally unquoted: it is a space-separated word list.
for sound in $sounds; do
  # grep exits with 1 when no line matches, which simply yields an empty
  # list; only a status greater than 1 signals a real failure.
  grep -- "^$sound" LIST > "$tmpdir/$sound.txt" \
    || [ $? -eq 1 ] \
    || die "Fatal error: cannot extract syllables for '$sound'"
done

# Enter target directory and rename đ.txt to dd.txt
cd -- "$tmpdir" || die "Fatal error: cannot enter directory $tmpdir"
mv -f -- đ.txt dd.txt || die 'Fatal error: cannot rename đ.txt to dd.txt'

# OK, now stripping...
# Each single-letter (or shorter) list also caught the syllables of the longer
# spellings that start with the same letters; remove those here.
strip_prefix c.txt 'ch'
strip_prefix g.txt 'g[hi]'
strip_prefix k.txt 'kh'
strip_prefix n.txt 'n[gh]'
strip_prefix ng.txt 'ngh'
strip_prefix p.txt 'ph'
strip_prefix t.txt 't[hr]'

# Counting...
# With several files, the last line of `wc -l` is the grand total.
wc -l ./*.txt | tail -n 1