/* HtmlTokenizer.java */

/* 
 * Copyright (C) 1996-98 Mark Boyns <boyns@sdsu.edu>
 *
 * This file is part of Muffin.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
package muffin.html;

import java.io.InputStream;
import java.io.OutputStream;
import java.io.BufferedInputStream;
import java.io.PushbackInputStream;
import java.io.IOException;
import java.util.StringTokenizer;

/**
 * Reads an HTML byte stream and splits it into {@link Token}s: tags
 * (<code>&lt;...&gt;</code>), comments (<code>&lt;!-- ... --&gt;</code>)
 * and the text found between them.
 *
 * The tokenizer is itself a {@link PushbackInputStream} so that the
 * <code>'&lt;'</code> which terminates a run of text can be pushed back
 * and re-read as the first byte of the next token.
 */
public class HtmlTokenizer extends PushbackInputStream
{
    /**
     * Characters which, when seen in this order (not necessarily
     * adjacent) inside a quoted attribute value, are taken as evidence
     * that the closing quote is missing.
     */
    private static final int[] BAD_QUOTE_SEQUENCE = { '>', '<', '>' };

    /**
     * Create a tokenizer reading from the given stream.
     *
     * @param in stream supplying the HTML bytes
     */
    public HtmlTokenizer (InputStream in)
    {
	super (in);
    }

    /**
     * Read the next token from the stream.
     *
     * A tag token runs from <code>'&lt;'</code> to the first
     * <code>'&gt;'</code> that is not inside a quoted value.  A tag
     * whose first four bytes are <code>&lt;!--</code> becomes a comment
     * token and runs until <code>--&gt;</code>; quotes are ignored
     * inside comments.  Anything else is a text token, which ends just
     * before the next <code>'&lt;'</code>.
     *
     * @return the next token, or <code>null</code> if the end of the
     *         stream was reached before any byte was read.  A token
     *         cut short by the end of the stream is returned as is.
     * @throws IOException if reading from or pushing back to the
     *         underlying stream fails
     */
    public Token getToken () throws IOException
    {
	int ch;
	boolean quoted = false;
	int quoteChar = 0;
	/* Progress through BAD_QUOTE_SEQUENCE for the current quoted
	   value only.  It is restarted whenever a quote opens so that a
	   partial match in one quoted value cannot carry over into the
	   next value or the next token. */
	int badIndex = 0;
	Token token = new Token ();

	while ((ch = read ()) != -1)
	{
	    if (token.type != Token.TT_COMMENT)
	    {
		/* look for end quote */
		if (quoted)
		{
		    if (ch == quoteChar)
		    {
			quoted = false;
		    }
		    else if (ch == BAD_QUOTE_SEQUENCE[badIndex])
		    {
			badIndex++;
			if (badIndex == BAD_QUOTE_SEQUENCE.length)
			{
			    /* The quote looks unterminated: leave quoted
			       mode so the '>' just read ends the tag. */
			    badIndex = 0;
			    quoted = false;
			    System.out.println ("HTML: Missing start or end quote");
			    System.out.println ();
			    /* Print only the bytes appended so far, not
			       the whole backing array. */
			    System.out.println (new String (token.bytes, 0, token.offset));
			    System.out.println ();
			}
		    }
		}
		/* look for start tag */
		else if (ch == '<')
		{
		    if (token.type != Token.TT_NONE)
		    {
			/* '<' belongs to the next token; hand back what
			   has been collected so far. */
			unread (ch);
			return token;
		    }
		    token.type = Token.TT_TAG;
		}
		/* look for start quote */
		else if (token.type == Token.TT_TAG && (ch == '"' || ch == '\''))
		{
		    quoted = true;
		    quoteChar = ch;
		    badIndex = 0;
		}
		/* otherwise it's text */
		else if (token.type == Token.TT_NONE)
		{
		    token.type = Token.TT_TEXT;
		}
	    }

	    token.append ((byte)ch);

	    /* see if the tag is really a comment */
	    if (token.type == Token.TT_TAG && token.offset == 4)
	    {
		if (token.bytes[0] == '<'
		    && token.bytes[1] == '!'
		    && token.bytes[2] == '-'
		    && token.bytes[3] == '-')
		{
		    token.type = Token.TT_COMMENT;
		}
	    }

	    /* look for end tag */
	    if (ch == '>' && !quoted && (token.type == Token.TT_TAG || token.type == Token.TT_COMMENT))
	    {
		if (token.type == Token.TT_COMMENT)
		{
		    /* a comment only ends on "-->" */
		    if (token.bytes[token.offset-1] == '>'
			&& token.bytes[token.offset-2] == '-'
			&& token.bytes[token.offset-3] == '-')
		    {
			break;
		    }
		}
		else
		{
		    break;
		}
	    }
	}

	return token.type == Token.TT_NONE ? null : token;
    }
}
