/******************************************************************************
 * Parser to convert roman transliterations into Unicode Devnagari
 * Copyright (C) 2003 Roshan Kamath
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 ******************************************************************************/

 /******************************************************************************
 * BEGIN DEFINITIONS
 ******************************************************************************/

 /*
 ** Local defines used by the rules and helper functions below. Each one is
 ** an HTML numeric character reference that is printed verbatim to the output.
 */
 /* 2381 (U+094D): the virAm sign printed after a consonant with no vowel */
 #define VIRAM	"&#2381;"
 #define ZWNJ	"&#8204;"		/* Zero Width Non Joiner (U+200C) */
 /* Not referenced by any rule or helper visible in this file */
 #define ZWJ	"&#8205;"		/* Zero Width Joiner (U+200D) */

 /*
 ** Prototype Declarations for the helpers defined in the user subroutines
 ** section. Every helper that takes no argument is declared with (void) so
 ** that the compiler can check its calls.
 */
 int isVowel(int c);		/* non-zero if c can begin a dependent vowel */
 int isConsonant(int c);	/* non-zero if c can begin a consonant */
 void addHamza(void);		/* push a ' in front of a following vowel */
 void addViram(void);		/* print a virAm unless a vowel follows */
 void reduceVowel(void);	/* push a .s (after any .n) onto the input */
 void encode(char* utfValue);	/* print utfValue, or echo the scanned text */
 void convh2s(void);		/* swallow a .h that follows a vowel */
 void adjustDanda(void);	/* decide whether a scanned 'a' is echoed */
 void correctAnusvar(void);	/* rewrite .N according to what follows it */

 /*
 ** Use a "flag" to detect when to do Transliteration from English to UCS.
 ** This global flag is set to 1 by the <dn> rule and reset to 0 by the </dn>
 ** rule. encode() and the helper functions test it to choose between printing
 ** Devanagari entities and passing the scanned text through.
 */
 int transliterate = 0;

 /*
 ** Use a "flag" to detect when to use the ZWNJ. This global flag is toggled
 ** every time the '\' rule matches. Effectively, text enclosed within a pair
 ** of '\' will not get the ZWNJ after a virAm. Note that the default value
 ** is 1!
 */
 int useNonJoiner = 1;

 /*
 ** Use a flag to detect if we should add the implicit virAms. This flag is set
 ** upon encountering the <noviram> tag and unset upon a </noviram>. While it
 ** is set, addViram() returns without printing anything.
 */
 int noViram = 0;


 /******************************************************************************
 * END DEFINITIONS
 ******************************************************************************/


%%
 /******************************************************************************
 * BEGIN RULES
 ******************************************************************************/

	/*
	** Skip trailing white spaces: the rule has no action, so blanks and
	** tabs found at the end of a line are dropped from the output
	*/
[ \t]+$

	/*
	** Replace white spaces by a single space. Add a Hamza (') if a Vowel
	** follows the white spaces
	*/
[ \t\r]+	printf(" "); addHamza();


	/*
	** Detect when to start and stop transliteration. Add a Hamza (')
	** if a Vowel follows <dn>. Neither tag is written to the output.
	** The flag is set before addHamza() is called because addHamza() does
	** nothing while the flag is clear.
	*/
"<dn>"		transliterate = 1; addHamza(); /* Set to TRUE */
"</dn>"		transliterate = 0; /* Set to FALSE */

	/*
	** Detect when to start and stop adding the implicit virAms. Neither
	** tag is written to the output.
	*/
"<noviram>"		noViram = 1; /* Set to TRUE */
"</noviram>"	noViram = 0; /* Set to FALSE */

	/*
	** ECHO all other HTML tags. Add a Hamza (') if a Vowel follows the tags.
	** The four tag rules above are listed first so that they are chosen
	** when both they and this rule match the same text.
	*/
\<[^\>]*\>	ECHO; addHamza();

	/*
	** Detect when to use the ZWNJ. The '\' itself is never written to the
	** output, and the toggle happens whether or not we are inside <dn>.
	*/
\\		useNonJoiner = !useNonJoiner;

	/*
	** Cover all punctuation marks. Update this list regularly :)
	** Each mark is echoed as is; a Hamza (') is added if a Vowel follows.
	*/
[-~!@#$%&*\(\)_+=\{\}\[\];\"\<,?/]		ECHO; addHamza();

	/*
	** Special Cases because UCS doesn't have fonts for these or these are
	** special symbols that I use in context. Both rules work by pushing
	** replacement text back onto the input with unput(); since unput()
	** pushes in front of what is already there, the characters are pushed
	** in reverse order.
	*/
`	{
		if (!transliterate)
			ECHO;
		else {
			/* Use .a for the Glottal Stop. addHamza() is called first so
			   that any ' it pushes back is scanned after the .a */
			addHamza(); unput('a'); unput('.');
		}
	}
w		{
			/* Use v.d for w: the scanner will read v, then .d */
			if (transliterate) {
				unput('d'); unput('.');
				unput('v'); 
			} else
				ECHO;
		}

	/*
	** Begin UCS Encodings here 0x0901 to 0x097F. Note that the encode function
	** uses the transliterate flag to determine whether to encode or not. This
	** is also true for the other helpers like addViram(), addHamza() etc.
	**
	** In case of a consonant, we add a virAm (.h) in case it is not followed by
	** a vowel. This can be inhibited by the <noviram></noviram> tags.
	**
	** For a vowel, we convert any following .h to .s This is only to maintain
	** backward compatibility with my personal Jtrans convention. [This has
	** however been temporarily disabled: convh2s() now just drops the .h]
	**
	** Note: Since UCS does not have the short versions of the aE and aO vowels
	** we represent the short versions by using the _ accent! To maintain
	** consistency, even the short version of E and O are rendered similarly.
	**
	** Finally, we eliminate the 'a' vowel directly since the default glyph
	** already had the vowel implicitly present. Of course, this is not done
	** blindly :)
	**
	** The rules below that start with a ' are the vowels written after a
	** Hamza; the ' is either typed in the input or pushed back by addHamza().
	*/
\.n		encode("&#2305;");
		/* 2306 exists, but I like to change the .N to .N^k or .N^j */
\.N		correctAnusvar();
		/* 2307 is the ':' But I shall use the standard ASCII version */
		/* 2308 is a Hole */
'a		encode("&#2309;"); convh2s();
'A		encode("&#2310;"); convh2s();
'i		encode("&#2311;");
'I		encode("&#2312;"); convh2s();
'u		encode("&#2313;");
'U		encode("&#2314;"); convh2s();
'\.r	encode("&#2315;");
'\.l	encode("&#2316;");
		/* 2317 is not used */
		/* 2318 is usable, but using it causes confusion/problems */
'e		{	/* Use 'E.s for 'e: reduceVowel() pushes the .s back first,
			   then 'E is pushed in front of it (in reverse order) */
			if (transliterate) {
				reduceVowel(); unput('E'); unput('\'');
			} else
				ECHO;
		}
'E		encode("&#2319;"); convh2s();
		/* Use 'aE.s for 'ae */
'ae		{
			if (transliterate) {
				reduceVowel(); unput('E'); unput('a'); unput('\'');
			} else
				ECHO;
		}
'aE		encode("&#2320;"); convh2s();
		/* 2321 is not used */
		/* 2322 is usable, but using it causes confusion/problems */
'o		{	/* Use 'O.s for 'o */
			if (transliterate) {
				reduceVowel(); unput('O'); unput('\'');
			} else
				ECHO;
		}
'O		encode("&#2323;"); convh2s();
		/* Use 'aO.s for 'ao */
'ao		{
			if (transliterate) {
				reduceVowel(); unput('O'); unput('a'); unput ('\'');
			} else
				ECHO;
		}
'aO		encode("&#2324;"); convh2s();

k		encode("&#2325;"); addViram();
		/* Every consonant rule has the same shape: encode() prints the
		   entity (or echoes the text outside <dn>) and addViram() then
		   prints a virAm unless a vowel follows. The two-letter forms
		   such as kH are picked over k because they match more text. */
kH		encode("&#2326;"); addViram();
g		encode("&#2327;"); addViram();
gH		encode("&#2328;"); addViram();
		/* .N^k is the text that correctAnusvar() pushes back for a .N
		   that is followed by k or g */
\.N^k	encode("&#2329;"); addViram();

Ch		encode("&#2330;"); addViram();
ChH		encode("&#2331;"); addViram();
j		encode("&#2332;"); addViram();
jH		encode("&#2333;"); addViram();
		/* .N^j is the text that correctAnusvar() pushes back for a .N
		   that is followed by C or j */
\.N^j	encode("&#2334;"); addViram();

T		encode("&#2335;"); addViram();
TH		encode("&#2336;"); addViram();
D		encode("&#2337;"); addViram();
DH		encode("&#2338;"); addViram();
N		encode("&#2339;"); addViram();

t		encode("&#2340;"); addViram();
tH		encode("&#2341;"); addViram();
d		encode("&#2342;"); addViram();
dH		encode("&#2343;"); addViram();
n		encode("&#2344;"); addViram();
		/* 2345 is not used */

p		encode("&#2346;"); addViram();
pH		encode("&#2347;"); addViram();
b		encode("&#2348;"); addViram();
bH		encode("&#2349;"); addViram();
m		encode("&#2350;"); addViram();

y		encode("&#2351;"); addViram();
r		encode("&#2352;"); addViram();
		/* 2353 is not used */
l		encode("&#2354;"); addViram();
L		encode("&#2355;"); addViram();
		/* 2356 is not used */
v		encode("&#2357;"); addViram();
Sh		encode("&#2358;"); addViram();
Xh		encode("&#2359;"); addViram();
s		encode("&#2360;"); addViram();
h		encode("&#2361;"); addViram();
		/* 2362 is a Hole */
		/* 2363 is a Hole */
		/* .d is pushed back by the w rule (v.d); addViram() postpones its
		   virAm decision when it sees a .d, so it is made here instead */
\.d		encode("&#2364;"); addViram(); /* This is usually NEVER part of external
										  input */
		/* .a is pushed back by the ` rule */
\.a		encode("&#2365;");  /* This is NEVER part of external input */

a		{
			/* The vowel 'a' has no entity of its own here: inside <dn> it is
			   either dropped or, if adjustDanda() decides so, echoed as a
			   plain 'a' */
			if (!transliterate) {
				ECHO; 
			} else {
				/* Consume any 'a' symbols judiciously since they are irrelevant
				   in UCS */
				adjustDanda();
				convh2s(); /* In case there is a .h following */
			}
		}
A		encode("&#2366;"); convh2s();
i		encode("&#2367;"); 
I		encode("&#2368;"); convh2s();
u		encode("&#2369;");
U		encode("&#2370;"); convh2s();
\.r		encode("&#2371;");
\.R		encode("&#2372;"); convh2s();
		/* 2373 is not used */
		/* 2374 is usable, but using it causes confusion/problems */
e		{	/* Use E.s for e: reduceVowel() pushes the .s back (after a
			   following .n, if any) and E is then pushed in front of it */
			if (transliterate) {
				reduceVowel(); unput('E');
			} else
				ECHO;
		}
E		encode("&#2375;"); convh2s();
		/* Use aE.s for ae */
ae		{
			if (transliterate) {
				reduceVowel(); unput('E'); unput('a');
			} else
				ECHO;
		}
aE		encode("&#2376;"); convh2s();
		/* 2377 is not used */
		/* 2378 is usable, but using it causes confusion/problems */
o		{	/* Use O.s for o */
			if (transliterate) {
				reduceVowel(); unput('O');
			} else
				ECHO;
		}
O		encode("&#2379;"); convh2s();
		/* Use aO.s for ao */
ao		{
			if (transliterate) {
				reduceVowel(); unput('O'); unput('a');
			} else
				ECHO;
		}
aO		encode("&#2380;"); convh2s();
\.h		{
			/* An explicit virAm typed in the input. It is printed even
			   inside <noviram>, which only affects addViram() */
			if (transliterate) {
				printf("%s",VIRAM);
				/* Now add the NonJoiner only if it is not inhibited */
				if (useNonJoiner) printf("%s",ZWNJ);
			} else
				ECHO;
		}
		/* 2382 is a Hole */
		/* 2383 is a Hole */
		/* 2384 is not used */
		/* 2385 is not used */
		/* .s is the text pushed back by reduceVowel() for the short vowels */
\.s		encode("&#2386;");
		/* 2387 is not used */
		/* 2388 is not used */
		/* 2389 is a Hole */
		/* 2390 is a Hole */
		/* 2391 is a Hole */
q		encode("&#2392;"); addViram();
Kh		encode("&#2393;"); addViram();
		/* Gh is printed as two entities (2328 followed by 2364). The scanned
		   text must be echoed only ONCE outside <dn>, so encode() cannot
		   simply be called twice here.
		   NOTE(review): 2394 is skipped and 2328 (the gH letter) is used as
		   the base of the pair - confirm that this is intended. */
Gh		{
			if (transliterate) {
				printf("%s%s", "&#2328;", "&#2364;");
				addViram();
			} else
				ECHO;
		}
z		encode("&#2395;"); addViram();
R		encode("&#2396;"); addViram();
RH		encode("&#2397;"); addViram();
f		encode("&#2398;"); addViram();
		/* 2399 is not used */
'\.R	encode("&#2400;"); convh2s();
'\.L	encode("&#2401;"); convh2s();
\.l		encode("&#2402;");
\.L		encode("&#2403;"); convh2s();
		/* A double bar is picked over two single bars because it matches
		   more text */
\|		encode("&#2404;");
\|\|	encode("&#2405;");
0		encode("&#2406;");
1		encode("&#2407;");
2		encode("&#2408;");
3		encode("&#2409;");
4		encode("&#2410;");
5		encode("&#2411;");
6		encode("&#2412;");
7		encode("&#2413;");
8		encode("&#2414;");
9		encode("&#2415;");
		/* A '.' that does not start any of the longer dotted tokens above */
\.		encode("&#2416;");

 /******************************************************************************
 * END RULES
 ******************************************************************************/


%%
 /******************************************************************************
 * BEGIN USER SUBROUTINES
 ******************************************************************************/


/*
** yywrap() is what the generated scanner calls when it runs out of input.
** It always answers 1, i.e. there is no further input to switch to.
*/
int yywrap(void)
{
	const int noMoreInput = 1;

	return noMoreInput;
}


/*
** Tell whether the character c can be the beginning of a dependent vowel.
**
** Returns 1 for a, A, i, I, u, U, e, E, o, O. For a '.', the next input
** character is inspected (and pushed back again): the answer is 1 when it is
** one of r, R, l, L. In every other case, including a '.' at the end of the
** input, the answer is 0.
*/
int isVowel(int c) {

	int next;

	if (c == '.') {
		/* We need further look ahead */
		next = input();
		if (next < 0)
			return 0;

		unput(next); /* Undo the lookahead */
		return next == 'r' || next == 'R' || next == 'l' || next == 'L';
	}

	switch (c) {
		case 'a': case 'A':
		case 'i': case 'I':
		case 'u': case 'U':
		case 'e': case 'E':
		case 'o': case 'O':
			return 1;

		default:
			return 0;
	}
}


/*
** Tell whether the character c can be the beginning of a consonant.
**
** Returns 1 for the first letter of every consonant token, for the symbols
** ` and ' and for 'H' and 'Z'. For a '.', the next input character is
** inspected (and pushed back again): the answer is 1 when it is N or a.
** In every other case, including a '.' at the end of the input, the answer
** is 0. No call to this function appears in the rules or helpers of this file.
*/
int isConsonant(int c) {

	int next;

	if (c == '.') {
		/* We need further look ahead */
		next = input();
		if (next < 0)
			return 0;

		unput(next); /* Undo the lookahead */
		return next == 'N' || next == 'a';
	}

	switch (c) {
		case '`': case 'w': case '\'':
		case 'k': case 'g':
		case 'C': case 'j':
		case 'T': case 'D': case 'N':
		case 't': case 'd': case 'n':
		case 'p': case 'b': case 'm':
		case 'y': case 'r': case 'l': case 'L': case 'v':
		case 'S': case 'X': case 's': case 'h':
		case 'H': /* As 'H' is an implicit consonant */
		case 'q': case 'K': case 'G':
		case 'z': case 'Z': case 'R': case 'f':
			return 1;

		default:
			return 0;
	}
}


/*
** Push a Hamza (') onto the input in front of the next character when that
** character begins a vowel. Nothing happens unless the transliterate flag is
** set, and nothing is pushed back at the end of the input.
*/
void addHamza() {
	int next;
	int startsVowel;

	if (!transliterate)
		return;

	next = input(); /* Attempt Lookahead */
	if (next < 0)
		return;

	/* Ask isVowel() BEFORE pushing the character back: isVowel() does its
	   own lookahead and expects this character to be consumed already */
	startsVowel = isVowel(next);

	unput(next); /* Undo the lookahead */
	if (startsVowel)
		unput('\'');
}


/*
** Print a virAm after the consonant that was just encoded, unless a vowel
** follows it. Nothing happens when the transliterate flag is clear or when
** we are inside a <noviram> zone.
**
**  - end of input     : the virAm is printed
**  - a vowel follows  : nothing is printed
**  - ".d" follows     : nothing is printed here; the .d rule calls this
**                       function again once the .d has been scanned
**  - 'H' follows      : the virAm is printed without a ZWNJ and the 'H' is
**                       put back as an 'h'
**  - anything else    : the virAm is printed, followed by the ZWNJ when
**                       useNonJoiner is set
*/
void addViram() {
	int next;

	if (!transliterate || noViram)
		return;

	next = input(); /* Attempt Lookahead */

	if (next < 0) {
		printf("%s",VIRAM); /* What else can we do? */
		return;
	}

	/* isVowel() looks ahead too, so it is called while the character is
	   still consumed */
	if (isVowel(next)) {
		unput(next); /* Undo the lookahead */
		return;
	}

	if (next == '.') {
		/* It could be a ".d" in which case the VIRAM decision is
		   delayed to beyond the ".d" */
		int after = input();

		if (after >= 0) {
			unput(after);
			if (after == 'd') {
				unput(next);
				return;
			}
		}
	}

	printf("%s",VIRAM);

	if (next == 'H') {
		/* Replace this now solitary 'H' by 'h' */
		unput('h');
		return;
	}

	/* Put the ZWNJ only if the flag is set */
	if (useNonJoiner)
		printf("%s",ZWNJ);

	unput(next); /* Undo the lookahead */
}


/*
** Push a .s onto the input to mark a vowel grade reduction (the callers then
** push the long vowel in front of it). If the input continues with a .n, the
** .s is placed after that .n; otherwise it is placed directly in front of
** whatever follows.
*/
void reduceVowel() {
	int first = input(); /* Look ahead for a .n */

	if (first == '.') {
		int second = input();

		if (second == 'n') {
			/* Pushed in reverse order: the scanner will read .n.s */
			unput('s');
			unput('.');
			unput(second);
			unput(first);
			return;
		}

		if (second >= 0)
			unput(second);
	}

	if (first >= 0)
		unput(first);

	unput('s');
	unput('.');
}


/*
** This function corrects the .N to one of .N^k, .N^j, N, n, and m depending
** on the following consonant
*/
void correctAnusvar() {
	if (!transliterate) return;

	int c = input(); // Look ahead for the following consonant;
	unput(c); // Restore lookahead

	switch (c) {
		case 'k':
		case 'g':
			/* Replace by .N^k */
			unput('k');
			unput('^');
			unput('N');
			unput('.');
			break;

		case 'C':
		case 'j':
			/* Replace by .N^j */
			unput('j');
			unput('^');
			unput('N');
			unput('.');
			break;

		case 't':
		case 'd':
		case 'n':
			/* Replace by n */
			unput('n');
			break;

		case 'T':
		case 'D':
		case 'N':
			/* Replace by N */
			unput('N');
			break;

		case 'p':
		case 'b':
		case 'm':
			/* Replace by m */
			unput('m');
			break;
	}
}


/*
** Decide whether the 'a' that was just scanned is kept in the output.
**
** The 'a' is echoed when the following input begins a vowel (as judged by
** isVowel()) or is a ".d". In all other cases, and at the end of the input,
** it is dropped. The characters that were looked at are pushed back.
** Retaining the 'a' is only a way of flagging potential bugs in the input
** text. The only caller, the 'a' rule, calls this while transliterating.
*/
void adjustDanda() {
	int next = input();
	int keep = 0;

	if (next < 0)
		return;

	if (isVowel(next)) {
		keep = 1;
	} else if (next == '.') {
		int after = input(); /* Lookahead for a d */

		if (after >= 0) {
			unput(after);
			keep = (after == 'd');
		}
	}

	if (keep) {
		ECHO; /* Retain the 'a' in the output */
	}

	unput(next);
}


/*
** This function prints the incoming string to stdout if the transliterate flag
** is set. Else it just sends the scanned token as is to the output via ECHO.
**
** utfValue: the text to print in place of the scanned token (the rules pass
**           an HTML numeric character reference such as "&#2325;")
**
** ECHO is a macro supplied by the scanner generator and is not guaranteed to
** be an expression, so it is used as a statement here and not as an operand
** of the ?: operator.
*/
void encode(char* utfValue) {
	if (transliterate) {
		printf("%s", utfValue);
	} else {
		ECHO;
	}
}


/*
** The convh2s is a function deliberately PUT here for backward compatibility
** purposes (so that my UCS scheme is compatible with my Jtrans scheme).
** The ORIGINAL intention was to replace any .h following a vowel by a .s
**
** Currently it just swallows a .h that directly follows: both characters are
** consumed and nothing is pushed back. Any other lookahead is restored.
** Nothing happens unless the transliterate flag is set.
*/
void convh2s() {
	int first;
	int second;

	if (!transliterate)
		return;

	first = input();
	if (first < 0)
		return;

	if (first == '.') {
		/* Look for a further 'h' */
		second = input();
		if (second == 'h')
			return; /* The .h is ignored */

		if (second >= 0)
			unput(second);
	}

	unput(first);
}

 /******************************************************************************
 * END USER SUBROUTINES
 ******************************************************************************/