/******************************************************************************
 * Parser to convert roman transliterations into Unicode Devnagari
 * Copyright (C) 2003 Roshan Kamath
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 ******************************************************************************/

/******************************************************************************
 * BEGIN DEFINITIONS
 ******************************************************************************/

/*
** String constants written to the output by the rules and helpers below.
** (C code in this section is indented so that flex copies it verbatim.)
*/
    #define VIRAM "्"
    #define ZWNJ  "‌"  /* Zero Width Non Joiner */
    #define ZWJ   "‍"  /* Zero Width Joiner */

/*
** Helpers defined in the user subroutines section at the end of the file.
*/
    int  isVowel(int);
    int  isConsonant(int);
    void addHamza(void);
    void addViram(void);
    void reduceVowel(void);
    void encode(char*);
    void convh2s();
    void adjustDanda();
    void correctAnusvar();

/*
** Transliteration switch.  While it is 0 the helpers pass the scanned text
** through unchanged; the two tag rules below set it to 1 and back to 0.
*/
    int transliterate = 0;

/*
** ZWNJ switch.  Every '\' in the input toggles it, so text enclosed in a pair
** of '\' is written without the ZWNJ.  Note that it starts out as 1!
*/
    int useNonJoiner = 1;

/*
** Implicit-virAm switch.  While it is 1, addViram() adds nothing; the two
** tag rules below set it to 1 and back to 0.
*/
    int noViram = 0;

/******************************************************************************
 * END DEFINITIONS
 ******************************************************************************/
%%
    /**************************************************************************
     * BEGIN RULES
     * (comments in this section are indented: flex expects a pattern at the
     * start of every unindented line)
     **************************************************************************/

    /*
    ** Drop white space at the end of a line
    */
[ \t]+$         /* discard */

    /*
    ** Collapse a run of white space into one blank.  A Hamza (') is pushed
    ** back when a vowel comes next.
    */
[ \t\r]+        { printf(" "); addHamza(); }

    /*
    ** Start and stop transliteration.  A Hamza (') is pushed back when a
    ** vowel follows the start pattern.
    ** NOTE(review): both patterns are empty string literals in this copy of
    ** the file; they look like tag literals whose text went missing.
    ** Confirm the intended tag text against the author's original.
    */
""              { transliterate = 1; addHamza(); }  /* Set to TRUE */
""              { transliterate = 0; }              /* Set to FALSE */

    /*
    ** Start and stop the suppression of the implicit virAms.
    ** NOTE(review): these two patterns are empty string literals as well.
    */
""              { noViram = 1; }  /* Set to TRUE */
""              { noViram = 0; }  /* Set to FALSE */

    /*
    ** HTML tags are copied through untouched.  A Hamza (') is pushed back
    ** when a vowel follows the tag.
    */
\<[^\>]*\>      { ECHO; addHamza(); }

    /*
    ** A '\' flips the ZWNJ switch and produces no output itself
    */
\\              { useNonJoiner = !useNonJoiner; }

    /*
    ** Punctuation marks are copied through.  Update this list regularly :)
    */
[-~!@#$%&*\(\)_+=\{\}\[\];\"\<,?/]   { ECHO; addHamza(); }

    /*
    ** Special cases: symbols that UCS has no glyph for, or that carry a
    ** private meaning in this scheme.  Both are rewritten by pushing other
    ** tokens back onto the input (last character first).
    */
`               {
                    if (transliterate) {
                        /* Use .a for the Glottal Stop */
                        addHamza();
                        unput('a');
                        unput('.');
                    } else {
                        ECHO;
                    }
                }

w               {
                    /* Use v.d for w */
                    if (!transliterate) {
                        ECHO;
                    } else {
                        unput('d');
                        unput('.');
                        unput('v');
                    }
                }

    /*
    ** UCS encodings 0x0901 to 0x097F start here.  encode() consults the
    ** transliterate flag and either prints the given string or echoes the
    ** scanned text; addViram(), addHamza() and convh2s() consult it too.
    **
    ** Consonants: addViram() appends a virAm (.h) unless a vowel follows.
    ** The noViram flag switches this off.
    **
    ** Vowels: convh2s() was meant to turn a following .h into .s, for
    ** backward compatibility with the author's personal Jtrans convention.
    ** [That conversion is currently disabled; a following .h is just
    ** dropped.]
    **
    ** Note: UCS has no short versions of the aE and aO vowels, so the short
    ** forms are written as the long vowel followed by .s (see reduceVowel()).
    ** For consistency the short E and O are handled the same way.
    **
    ** Finally, the 'a' vowel is not encoded at all because the consonant
    ** glyph already carries it implicitly.  Of course, this is not done
    ** blindly :)
    */
\.n             { encode("ँ"); }

    /* 2306 exists, but I like to change the .N to .N^k or .N^j */
\.N             { correctAnusvar(); }

    /* 2307 is the ':' But I shall use the standard ASCII version */
    /* 2308 is a Hole */

'a              { encode("अ"); convh2s(); }
'A              { encode("आ"); convh2s(); }
'i              { encode("इ"); }
'I              { encode("ई"); convh2s(); }
'u              { encode("उ"); }
'U              { encode("ऊ"); convh2s(); }
'\.r            { encode("ऋ"); }
'\.l            { encode("ऌ"); }

    /* 2317 is not used */
    /* 2318 is usable, but using it causes confusion/problems */
'e              {
                    /* Use 'E.s for 'e */
                    if (!transliterate) {
                        ECHO;
                    } else {
                        reduceVowel();
                        unput('E');
                        unput('\'');
                    }
                }
'E              { encode("ए"); convh2s(); }

    /* Use 'aE.s for 'ae */
'ae             {
                    if (!transliterate) {
                        ECHO;
                    } else {
                        reduceVowel();
                        unput('E');
                        unput('a');
                        unput('\'');
                    }
                }
'aE             { encode("ऐ"); convh2s(); }

    /* 2321 is not used */
    /* 2322 is usable, but using it causes confusion/problems */
'o              {
                    /* Use 'O.s for 'o */
                    if (!transliterate) {
                        ECHO;
                    } else {
                        reduceVowel();
                        unput('O');
                        unput('\'');
                    }
                }
'O              { encode("ओ"); convh2s(); }

    /* Use 'aO.s for 'ao */
'ao             {
                    if (!transliterate) {
                        ECHO;
                    } else {
                        reduceVowel();
                        unput('O');
                        unput('a');
                        unput('\'');
                    }
                }
'aO             { encode("औ"); convh2s(); }

k               { encode("क"); addViram(); }
kH              { encode("ख"); addViram(); }
g               { encode("ग"); addViram(); }
gH              { encode("घ"); addViram(); }
\.N^k           { encode("ङ"); addViram(); }
Ch              { encode("च"); addViram(); }
ChH             { encode("छ"); addViram(); }
j               { encode("ज"); addViram(); }
jH              { encode("झ"); addViram(); }
\.N^j           { encode("ञ"); addViram(); }
T               { encode("ट"); addViram(); }
TH              { encode("ठ"); addViram(); }
D               { encode("ड"); addViram(); }
DH              { encode("ढ"); addViram(); }
N               { encode("ण"); addViram(); }
t               { encode("त"); addViram(); }
tH              { encode("थ"); addViram(); }
d               { encode("द"); addViram(); }
dH              { encode("ध"); addViram(); }
n               { encode("न"); addViram(); }
    /* 2345 is not used */
p               { encode("प"); addViram(); }
pH              { encode("फ"); addViram(); }
b               { encode("ब"); addViram(); }
bH              { encode("भ"); addViram(); }
m               { encode("म"); addViram(); }
y               { encode("य"); addViram(); }
r               { encode("र"); addViram(); }
    /* 2353 is not used */
l               { encode("ल"); addViram(); }
L               { encode("ळ"); addViram(); }
    /* 2356 is not used */
v               { encode("व"); addViram(); }
Sh              { encode("श"); addViram(); }
Xh              { encode("ष"); addViram(); }
s               { encode("स"); addViram(); }
h               { encode("ह"); addViram(); }
    /* 2362 is a Hole */
    /* 2363 is a Hole */
\.d             { encode("़"); addViram(); }  /* This is usually NEVER part of external input */
\.a             { encode("ऽ"); }              /* This is NEVER part of external input */

a               {
                    if (!transliterate) {
                        ECHO;
                    } else {
                        /* Consume any 'a' symbols judiciously since they are
                           irrelevant in UCS */
                        adjustDanda();
                        convh2s();  /* In case there is a .h following */
                    }
                }
A               { encode("ा"); convh2s(); }
i               { encode("ि"); }
I               { encode("ी"); convh2s(); }
u               { encode("ु"); }
U               { encode("ू"); convh2s(); }
\.r             { encode("ृ"); }
\.R             { encode("ॄ"); convh2s(); }

    /* 2373 is not used */
    /* 2374 is usable, but using it causes confusion/problems */
e               {
                    /* Use E.s for e */
                    if (transliterate) {
                        reduceVowel();
                        unput('E');
                    } else {
                        ECHO;
                    }
                }
E               { encode("े"); convh2s(); }

    /* Use aE.s for ae */
ae              {
                    if (transliterate) {
                        reduceVowel();
                        unput('E');
                        unput('a');
                    } else {
                        ECHO;
                    }
                }
aE              { encode("ै"); convh2s(); }

    /* 2377 is not used */
    /* 2378 is usable, but using it causes confusion/problems */
o               {
                    /* Use O.s for o */
                    if (transliterate) {
                        reduceVowel();
                        unput('O');
                    } else {
                        ECHO;
                    }
                }
O               { encode("ो"); convh2s(); }

    /* Use aO.s for ao */
ao              {
                    if (transliterate) {
                        reduceVowel();
                        unput('O');
                        unput('a');
                    } else {
                        ECHO;
                    }
                }
aO              { encode("ौ"); convh2s(); }

\.h             {
                    if (transliterate) {
                        printf("%s", VIRAM);
                        /* Now add the NonJoiner only if it is not inhibited */
                        if (useNonJoiner) {
                            printf("%s", ZWNJ);
                        }
                    } else {
                        ECHO;
                    }
                }

    /* 2382 is a Hole */
    /* 2383 is a Hole */
    /* 2384 is not used */
    /* 2385 is not used */
\.s             { encode("॒"); }
    /* 2387 is not used */
    /* 2388 is not used */
    /* 2389 is a Hole */
    /* 2390 is a Hole */
    /* 2391 is a Hole */
q               { encode("क़"); addViram(); }
Kh              { encode("ख़"); addViram(); }
    /*
    ** The letter and its dot go out through ONE encode() call (adjacent
    ** string literals are joined by the compiler).  Two separate calls would
    ** echo the scanned "Gh" twice whenever transliteration is switched off.
    */
Gh              { encode("घ" "़"); addViram(); }
z               { encode("ज़"); addViram(); }
R               { encode("ड़"); addViram(); }
RH              { encode("ढ़"); addViram(); }
f               { encode("फ़"); addViram(); }
    /* 2399 is not used */
'\.R            { encode("ॠ"); convh2s(); }
'\.L            { encode("ॡ"); convh2s(); }
\.l             { encode("ॢ"); }
\.L             { encode("ॣ"); convh2s(); }
\|              { encode("।"); }
\|\|            { encode("॥"); }
0               { encode("०"); }
1               { encode("१"); }
2               { encode("२"); }
3               { encode("३"); }
4               { encode("४"); }
5               { encode("५"); }
6               { encode("६"); }
7               { encode("७"); }
8               { encode("८"); }
9               { encode("९"); }
\.              { encode("॰"); }

    /**************************************************************************
     * END RULES
     **************************************************************************/
%%
/******************************************************************************
 * BEGIN USER SUBROUTINES
 ******************************************************************************/

/*
** Tell the scanner that there is no further input once the current one ends.
*/
int yywrap(void)
{
    return 1;
}

/*
** Returns 1 if the character c could start a dependent vowel, 0 otherwise.
** For a '.', one more character is read and pushed back again: ".r", ".R",
** ".l" and ".L" count as vowels.  The character c itself is NOT pushed back;
** that is left to the caller.
*/
int isVowel(int c)
{
    int d;

    switch (c) {
    case 'a': case 'A':
    case 'i': case 'I':
    case 'u': case 'U':
    case 'e': case 'E':
    case 'o': case 'O':
        return 1;

    case '.':
        /* We need further look ahead */
        d = input();
        if (d >= 0) {
            unput(d);  /* Undo the lookahead */
            switch (d) {
            case 'r': case 'R':
            case 'l': case 'L':
                return 1;
            default:
                return 0;
            }
        }
        /* Fall through */

    default:
        return 0;
    }
}

/*
** Returns 1 if the character c could start a consonant, 0 otherwise.
** For a '.', one more character is read and pushed back again: ".N" and ".a"
** count as consonants.  The character c itself is NOT pushed back.
** (Nothing in the visible part of this file calls this function.)
*/
int isConsonant(int c)
{
    int d;

    switch (c) {
    case '`': case 'w': case '\'':
    case 'k': case 'g': case 'C': case 'j':
    case 'T': case 'D': case 'N':
    case 't': case 'd': case 'n':
    case 'p': case 'b': case 'm':
    case 'y': case 'r': case 'l': case 'L': case 'v':
    case 'S': case 'X': case 's': case 'h':
    case 'H':  /* As 'H' is an implicit consonant */
    case 'q': case 'K': case 'G': case 'z': case 'Z': case 'R': case 'f':
        return 1;

    case '.':
        /* We need further look ahead */
        d = input();
        if (d >= 0) {
            unput(d);  /* Undo the lookahead */
            switch (d) {
            case 'N': case 'a':
                return 1;
            default:
                return 0;
            }
        }
        /* Fall through */

    default:
        return 0;
    }
}

/*
** Pushes a Hamza (') onto the input in front of the next character when that
** character starts a vowel.  Does nothing unless the transliterate flag is
** set.
*/
void addHamza()
{
    if (!transliterate) {
        return;
    }

    int c = input();  /* Attempt lookahead */
    if (c < 0) {
        return;
    }

    /* isVowel() performs its own lookahead, so c may only be pushed back
       after it has returned. */
    int vowelFollows = isVowel(c);
    unput(c);  /* Undo the lookahead */
    if (vowelFollows) {
        unput('\'');
    }
}

/*
** Prints a virAm (.h) when the next character does not start a vowel; a Zero
** Width Non Joiner may be printed after it.  Does nothing unless the
** transliterate flag is set, and nothing inside a noViram zone.
*/
void addViram()
{
    if (!transliterate) {
        return;
    }
    /* If we are in a noViram zone, just return */
    if (noViram) {
        return;
    }

    int c = input();  /* Attempt lookahead */
    if (c < 0) {
        printf("%s", VIRAM);  /* What else can we do? */
        return;
    }

    /* isVowel() performs its own lookahead, so c may only be pushed back
       after it has returned. */
    if (isVowel(c)) {
        unput(c);  /* Undo the lookahead */
        return;
    }

    /* It could be a ".d" in which case the VIRAM decision is delayed to
       beyond the ".d" */
    if (c == '.') {
        int d = input();
        if (d >= 0) {
            unput(d);
            if (d == 'd') {
                unput(c);
                return;
            }
        }
    }

    printf("%s", VIRAM);

    if (c == 'H') {
        /* Replace this now solitary 'H' by 'h' */
        unput('h');
        return;
    }

    /* Put the ZWNJ only if the flag is set */
    if (useNonJoiner) {
        printf("%s", ZWNJ);
    }
    unput(c);  /* Undo the lookahead */
}

/*
** Pushes a ".s" onto the input (presumably after a long vowel) to indicate a
** vowel grade reduction.  If a ".n" comes next, the ".s" is placed after the
** ".n".  Characters are pushed back last-first.
*/
void reduceVowel()
{
    int dot = input();  /* Look ahead for a .n */

    if (dot >= 0) {
        if (dot == '.') {
            int n = input();
            if (n >= 0) {
                if (n == 'n') {
                    /* Input becomes ".n.s" */
                    unput('s');
                    unput('.');
                    unput(n);
                    unput(dot);
                    return;
                }
                unput(n);
            }
        }
        unput(dot);
    }

    unput('s');
    unput('.');
}

/*
** Replaces a scanned .N by one of .N^k, .N^j, N, n or m depending on the
** consonant that follows; the replacement is pushed onto the input in front
** of that consonant.  If the next character is none of the listed
** consonants, or the input has ended, nothing is pushed back and the .N
** produces no output.
**
** When the transliterate flag is not set, the scanned text is echoed
** unchanged, as every other token rule does.
*/
void correctAnusvar()
{
    if (!transliterate) {
        ECHO;  /* pass the token through instead of dropping it */
        return;
    }

    int c = input();  /* Look ahead for the following consonant */
    if (c < 0) {
        return;  /* end of input: there is nothing to push back */
    }
    unput(c);  /* Restore lookahead */

    switch (c) {
    case 'k': case 'g':
        /* Replace by .N^k */
        unput('k');
        unput('^');
        unput('N');
        unput('.');
        break;

    case 'C': case 'j':
        /* Replace by .N^j */
        unput('j');
        unput('^');
        unput('N');
        unput('.');
        break;

    case 't': case 'd': case 'n':
        /* Replace by n */
        unput('n');
        break;

    case 'T': case 'D': case 'N':
        /* Replace by N */
        unput('N');
        break;

    case 'p': case 'b': case 'm':
        /* Replace by m */
        unput('m');
        break;

    default:
        /* No replacement for any other follower */
        break;
    }
}

/*
** Decides whether the 'a' that was just scanned is kept in the output.  It is
** echoed when a vowel (as judged by isVowel()) or a ".d" follows; in every
** other case it is dropped.  This behavior is quite arbitrary and may need
** to change with future additions: keeping the 'a' is only a way of flagging
** potential bugs in the input text.
*/
void adjustDanda()
{
    int c = input();

    if (c < 0) {
        return;
    }

    if (isVowel(c)) {
        ECHO;  /* Retain the 'a' in the output */
    } else if (c == '.') {
        int d = input();  /* Lookahead for a d */
        if (d >= 0) {
            unput(d);
            if (d == 'd') {
                ECHO;  /* Retain the 'a' in the output */
            }
        }
    }

    unput(c);
}

/*
** Prints utfValue to stdout if the transliterate flag is set; otherwise the
** scanned token is echoed as is.
**
** An if/else is used because ECHO need not be an expression: flex defines it
** as a statement macro, which cannot be an operand of ?:.
*/
void encode(char* utfValue)
{
    if (transliterate) {
        printf("%s", utfValue);
    } else {
        ECHO;
    }
}

/*
** convh2s was put here for backward compatibility (so that the UCS scheme
** stays compatible with the author's Jtrans scheme): it was meant to turn a
** .h that follows a vowel into a .s.
**
** Currently it simply swallows a .h that follows a vowel; any other
** lookahead is pushed back untouched.  Does nothing unless the transliterate
** flag is set.
*/
void convh2s()
{
    if (!transliterate) {
        return;
    }

    int dot = input();
    if (dot < 0) {
        return;
    }

    if (dot == '.') {
        /* Look for a further 'h' */
        int h = input();
        if (h >= 0) {
            if (h == 'h') {
                return;  /* the ".h" is consumed and produces nothing */
            }
            unput(h);
        }
    }

    unput(dot);
}

/******************************************************************************
 * END USER SUBROUTINES
 ******************************************************************************/