/* -*- Mode: c++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4; c-file-style: "stroustrup" -*-
 *
 * Copyright (C) 1995-2010 Opera Software AS.  All rights reserved.
 *
 * This file is part of the Opera web browser.
 * It may not be distributed under any circumstances.
 */

/**
 * HTML 5 parser
 */

#ifndef HTML5PARSER_H
#define HTML5PARSER_H

#include "modules/logdoc/src/html5/html5base.h"
#include "modules/logdoc/src/html5/html5treebuilder.h"
#include "modules/prefs/prefsmanager/collections/pc_parsing.h"

class HTML5Tokenizer;
class LogicalDocument;
#ifndef HTML5_STANDALONE
class ES_LoadManager;
class TextSelection;
#endif // HTML5_STANDALONE

/**
 * Status type returned by the non-LEAVEing parser entry points, such as
 * HTML5Parser::Parse(). Holds either a normal OOM status value or one of
 * the HTML5ParserStatus codes.
 */
typedef int OP_PARSER_STATUS;

/**
 * Parser specific status codes, allocated directly after USER_SUCCESS.
 * The parser LEAVEs with these values to tell the caller why it stopped.
 */
class HTML5ParserStatus : public OpStatus
{
public:
	enum
	{
		/// More data must be fed to the parser (through AppendDataL())
		/// before parsing can continue.
		NEED_MORE_DATA = USER_SUCCESS + 1,

		/// Control must be returned to the message loop so that scripts
		/// can run. (USER_SUCCESS + 2)
		EXECUTE_SCRIPT,

		/// The buffer written from script has been completely parsed.
		/// (USER_SUCCESS + 3)
		FINISHED_DOC_WRITE,

		/// NOTE(review): not described in this header; presumably signals
		/// that parsing has been deferred - confirm against the
		/// implementation. (USER_SUCCESS + 4)
		PARSING_POSTPONED
	};
};

/**
 * HTML5Parser is the interface between the HTML parsing code and the rest of
 * the document handling code.
 * The parser consists of a HTML5Tokenizer and a HTML5TreeBuilder and will build
 * a logical tree from the data stream according to the rules of the HTML 5 spec.
 * The tokenizer handles the markup data stream and feeds the HTML5TreeBuilder
 * with tokens for elements and nodes from that stream.
 * The tree builder processes those tokens and creates a logical tree of nodes
 * from it according to the spec.
 *
 * Parsing data from a URL:
 * 1. Feed the parser data from the URL by calling AppendDataL().
 * 2. Start the parsing by calling ParseL().
 * 3. If the parser LEAVEs with EXECUTE_SCRIPT, return control to the
 *    message loop to run some scripts.
 * 4. If the parser LEAVEs with NEED_MORE_DATA, get more data from the
 *    URL, possibly returning control to the message loop, and feed it
 *    to the parser through AppendDataL().
 * 5. If the parser LEAVEs with ERR_NO_MEMORY, we do not have enough
 *    memory available to finish the parsing, so abort the current
 *    parsing and possibly try again if we can free up some memory.
 * 6. If the parser doesn't LEAVE, the entire stream is parsed and a
 *    logical tree has been built from the stream.
 * 7. After having done the necessary steps in 3 or 4 above, continue
 *    parsing the stream where it left off by calling ContinueParsingL(),
 *    and treat the return value as in step 3, 4, 5 or 6 above.
 *
 * Writing data from script:
 * 1. Start normal parsing as in 1 above and wait for it to signal that the
 *    script that will write the data should be executed, or go directly to
 *    2 if the document is entirely generated by script.
 * 2. Feed the script data to the parser with InsertDataL().
 * 3. Start parsing with ParseL() if the document is entirely
 *    generated by script, or call ContinueParsingL() if the
 *    parser has already started processing the stream.
 * 4. If the parser LEAVEs with EXECUTE_SCRIPT, return control to the
 *    message queue to run some more scripts. Most likely the running
 *    script is blocked. It is important that the script signals back
 *    to the parser using SignalScriptFinished() when it has finished
 *    executing, or has been cancelled, so that the parser can release
 *    any blocking or pausing that may have been set.
 * 5. If the parser LEAVEs with FINISHED_DOC_WRITE, the buffer written
 *    from script is finished parsing. Tell the script that the written
 *    data is parsed and return control to the message loop to continue
 *    executing the script. If the script writes more data goto step 2.
 * 6. If the parser LEAVEs with ERR_NO_MEMORY, we do not have enough
 *    memory available to finish the parsing, so abort the current
 *    parsing and possibly try again if we can free up some memory.
 * 7. If the parser doesn't LEAVE, it usually means that the entire
 *    stream is parsed and a logical tree has been built from the stream,
 *    but in case the data came from a script, it indicates that something
 *    is wrong since it should have signaled FINISHED_DOC_WRITE :)
 * 8. After having done the necessary steps in 4 or 5 above, continue
 *    parsing the script data, or the data from the URL if all script
 *    data has been processed, where it left off by calling
 *    ContinueParsingL(), and treat the return value as in step 4, 5,
 *    6 or 7 above.
 *
 * Parsing a data fragment (innerHTML) under a context element:
 * 1. Feed the string to be parsed to the parser with AppendDataL().
 * 2. Start parsing with ParseL() with the context element as
 *    one of the arguments.
 * 3. If the parser LEAVEs with ERR_NO_MEMORY, we do not have enough
 *    memory available to finish the parsing, so abort the current
 *    parsing and possibly try again if we can free up some memory.
 * 4. During fragment parsing the parser should not LEAVE with any
 *    other value since running script is deferred in that case, so
 *    if that happens it is an error.
 * 5. When the parser returns it will have parsed the fragment string
 *    to a sub-tree under the root passed to ParseL() in step 2.
 *
 *
 * Use SerializeTreeL() for serializing a logical (sub)-tree into
 * a string of markup data.
 */
class HTML5Parser
{
public:

// Included inside the class body so that its declarations become members of
// HTML5Parser. NOTE(review): presumably this is what declares ErrorCode and
// GENERIC used below; neither is declared elsewhere in this header - confirm.
#include "modules/logdoc/src/html5/errorcodes.h"

	/**
	 * Creates an idle parser: no tokenizer, no recorded errors, first line
	 * number 1 and all state flags cleared.
	 * @param[in] logdoc The logical document this parser belongs to. The
	 *   pointer is stored and handed back by GetLogicalDocument().
	 */
	HTML5Parser(LogicalDocument *logdoc)
		: m_logdoc(logdoc)
		, m_tokenizer(NULL)
		, m_first_line_number(1)
		, m_err_idx(0)
		, m_is_parsing(FALSE)
		, m_is_pausing(FALSE)
		, m_is_blocking(FALSE)
		, m_is_plaintext(FALSE)
		, m_report_errors(FALSE)
#ifdef HTML5_STANDALONE
		, m_token(NULL)
		, m_output_tokenizer_results(FALSE)
#endif // HTML5_STANDALONE
	{}

	~HTML5Parser();

	/**
	 * Call to parse the contents of the added buffers.
	 * @param[in] root The document node of the tree to parse. Not NULL.
	 * @param[in] context Will be used as the context node when parsing
	 * a fragment. NULL will mean normal, non-fragment parsing.
	 *
	 * Can Leave with OOM values or HTML5ParserStatus.
	 */
	void	ParseL(HTML5ELEMENT *root, HTML5ELEMENT *context);

	/**
	 * Non-LEAVEing version of ParseL().
	 * @param[in] root The document node of the tree to parse. Not NULL.
	 * @param[in] context Will be used as the context node when parsing
	 * a fragment. NULL will mean normal, non-fragment parsing.
	 * @returns Normal OOM values or HTML5ParserStatus.
	 */
	OP_PARSER_STATUS	Parse(HTML5ELEMENT *root, HTML5ELEMENT *context)
	{
		// Convert whatever ParseL() LEAVEs with into a return value.
		TRAPD(ret_stat, ParseL(root, context));
		return ret_stat;
	}

	/**
	 * Call to continue parsing after the parser has suspended for script.
	 * Can Leave with OOM values or HTML5ParserStatus.
	 */
	void	ContinueParsingL();

	/**
	 * Call to abort parsing or just tell the parser there is no more
	 * data coming.
	 * Can LEAVE with normal OOM values.
	 * @param abort If TRUE, the parser will not try to put the logical
	 *              tree in a well-formed state, but just stop parsing
	 *              and clean up allocated memory.
	 */
	void	StopParsingL(BOOL abort);

	/**
	 * Append data to the existing stream of data to parse.
	 * @param[in] buffer Pointer to the start of the buffer to append. Not NULL.
	 * @param[in] length Length, in uni_chars, of the buffer.
	 * @param[in] end_of_data Set to TRUE if this is the last data to be appended.
	 * @param[in] is_fragment Set to TRUE if this is a fragment (innerHTML) which
	 * will result in less preprocessing.
	 */
	void	AppendDataL(const uni_char *buffer, unsigned length, BOOL end_of_data, BOOL is_fragment);

	/**
	 * Non-LEAVEing version of AppendDataL().
	 * @param[in] buffer Pointer to the start of the buffer to append. Not NULL.
	 * @param[in] length Length, in uni_chars, of the buffer.
	 * @param[in] end_of_data Set to TRUE if this is the last data to be appended.
	 * @param[in] is_fragment Set to TRUE if this is a fragment (innerHTML) which
	 * will result in less preprocessing.
	 * @return ERR_NO_MEMORY on OOM otherwise OK
	 */
	OP_STATUS	AppendData(const uni_char *buffer, unsigned length, BOOL end_of_data, BOOL is_fragment)
	{
		RETURN_IF_LEAVE(AppendDataL(buffer, length, end_of_data, is_fragment));
		return OpStatus::OK;
	}

	/**
	 * Insert data at the current position of the existing stream of data to parse.
	 * @param[in] buffer Pointer to the start of the buffer to insert. Not NULL.
	 * @param[in] length Length, in uni_chars, of the buffer.
	 * @param[in] add_newline Will add a newline to the inserted buffer
	 */
	void	InsertDataL(const uni_char *buffer, unsigned length, BOOL add_newline);

	/**
	 * Called when a script inserted by the parser has finished running,
	 * either because it ran to completion or because it failed.
	 * @param script_element The element of the finished script
	 * @param loadman (Non-standalone builds only) Handed on unchanged to the
	 *  tree builder. NOTE(review): its role, and that of the constant TRUE
	 *  passed along with it, is not visible in this header - confirm in
	 *  HTML5TreeBuilder.
	 * @return TRUE if the finished script was started by the parser,
	 *  otherwise FALSE.
	 */
#ifdef HTML5_STANDALONE
	BOOL	SignalScriptFinished(HTML5ELEMENT *script_element) { return m_builder.SignalScriptFinished(script_element); }
#else // HTML5_STANDALONE
	BOOL	SignalScriptFinished(HTML5ELEMENT *script_element, ES_LoadManager *loadman) { return m_builder.SignalScriptFinished(script_element, loadman, TRUE); }

	/**
	 * Forwards to HTML5TreeBuilder::AddBlockingScript() on the associated
	 * tree builder and returns its status.
	 * @param script_element The script element to hand to the tree builder.
	 */
	OP_STATUS	AddBlockingScript(HTML5ELEMENT *script_element) { return m_builder.AddBlockingScript(script_element); }
#endif // HTML5_STANDALONE

	/**
	 * Called when the parser has LEAVEd with FINISHED_DOC_WRITE, and control is
	 * returned to the message loop, to flush all pending text in the text
	 * accumulator. This is done so that all the pending text is available in the
	 * DOM tree if the script tries to access what it just wrote.
	 */
	void	FlushTextL() { m_builder.InsertTextL(); }

	/**
	 * Resets the internal state of the parser to the initial state. Must be called
	 * before reusing a parser that has already parsed some content.
	 */
	void	ResetParser();

	/**
	 * Gives the insertion mode for an element type. Used by the resetting of
	 * insertion mode in the tree builder and by the innerHTML code in DOM.
	 * (Placed here to be available to the DOM code)
	 * @param elm_type Element type for the (context) element.
	 * @param is_fragment Set to TRUE if the element is the context element.
	 * @returns The insertion mode to use for the element type.
	 */
	static HTML5TreeBuilder::InsertionMode
			GetInsertionModeFromElementType(HTML_ElementType elm_type, BOOL is_fragment);

#ifdef SPECULATIVE_PARSER
	/**
	 * Returns the value of the current position in the stream. Since this value
	 * combines a non-preprocessed position with a preprocessed one it is only
	 * comparable to a position returned from the same function.
	 */
	unsigned 	GetStreamPosition();
#endif // SPECULATIVE_PARSER

#if defined DELAYED_SCRIPT_EXECUTION || defined SPECULATIVE_PARSER
	/**
	 * @return  The uni_char offset (compared to all data added) of the start of the last
	 *   buffer we've added to the tokenizer.  Only valid during parsing.
	 *   Returns 0 if no buffers have been added (and may also return 0 if parsing has finished).
	 */
	unsigned	GetLastBufferStartOffset();

	/**
	 * During parsing with delayed script execution, this stores the state of the tree
	 * and parser for each delayed script element we encounter.  Used so that we can
	 * restore the state in case of ESRecover.
	 * @param state  The variable where to store the state.  Use this with RestoreParserState() later
	 * @return  ERR_NO_MEMORY on memory error, otherwise OK
	 */
	OP_STATUS	StoreParserState(HTML5ParserState* state);

	/**
	 * Used by DSE to restore the parser to the state it had when state was stored
	 * by StoreParserState().
	 * @param state  The state which was stored in the parameter to StoreParserState()
	 * @param script_element  The script element which caused us to restore
	 * @param buffer_stream_position  The position in the stream we've restored to
	 * @param script_has_completed  If TRUE, the script_element which caused the restore has
	 *   actually completed running, so don't restore its script state
	 * @return  ERR_NO_MEMORY on memory error, otherwise OK
	 */
	OP_STATUS	RestoreParserState(HTML5ParserState* state, HTML5ELEMENT* script_element, unsigned buffer_stream_position, BOOL script_has_completed);
#endif // defined DELAYED_SCRIPT_EXECUTION || defined SPECULATIVE_PARSER

#ifdef DELAYED_SCRIPT_EXECUTION
	/**
	 * Used by DSE when we need to recover and restart parsing from a previous
	 * position.  This resets the parser to expect more data.
	 * @return  ERR_NO_MEMORY on memory error, otherwise OK
	 */
	OP_STATUS       Recover();

	/**
	 * NOTE(review): undocumented in the original header. Presumably returns
	 * TRUE if the current parser state differs from the one stored in state
	 * by StoreParserState() - confirm against the implementation.
	 * @param state  A state previously handed to StoreParserState().
	 */
	BOOL		HasParserStateChanged(HTML5ParserState* state);

	/**
	 * Used by DSE to see if there is more data in the parser that needs to be
	 * parsed before finishing the current delayed script and continuing to
	 * the next.
	 * @returns TRUE if there is unfinished write data in the parser.
	 */
	BOOL		HasUnfinishedWriteData();

	/**
	 * During parsing, this returns the latest element inserted, except any elements
	 * which have been foster parented.  This is basically the currently last leaf
	 * of the tree still being constructed, but as the parser sees it (so doesn't take
	 * into account things like layout or scripts which may have changed the tree compared
	 * to the parser's view of it).
	 * Only returns a meaningful value while parsing.
	 */
	HTML_Element*	GetLastNonFosterParentedElement() const { return m_builder.GetLastNonFosterParentedElement(); }
#endif // DELAYED_SCRIPT_EXECUTION

	/**
	 * Options controlling SerializeTreeL() and SerializeSelectionL().
	 * Defaults: markup (not text only), HTML (not XML), attributes included
	 * and the root itself included. The setters return *this so that they
	 * can be chained.
	 *
	 * NOTE(review): the flags are 1-bit BOOL bit-fields. If BOOL is a signed
	 * integer type, a set flag may read back as -1 rather than TRUE, so test
	 * the flags for truth instead of comparing them with TRUE - confirm
	 * against the definition of BOOL.
	 */
	class SerializeTreeOptions
	{
	public:
		SerializeTreeOptions()
			: m_text_only(FALSE),
			m_is_xml(FALSE),
			m_skip_attributes(FALSE),
			m_include_this(TRUE) {}

		SerializeTreeOptions&	TextOnly(BOOL value = TRUE) { m_text_only = value; return *this; }
		SerializeTreeOptions&	IsXml(BOOL value = TRUE) { m_is_xml = value; return *this; }
		SerializeTreeOptions&	SkipAttributes(BOOL value = TRUE) { m_skip_attributes = value; return *this; }
		SerializeTreeOptions&	IncludeRoot(BOOL value = TRUE) { m_include_this = value; return *this; }

		/// Will only add text content, as for innerText, which
		/// means that &lt;br&gt; elements are converted to newline
		/// characters (à la WebKit and old Operas).
		BOOL		m_text_only : 1;
		/// TRUE if the tree is to be serialized as XML.
		BOOL		m_is_xml : 1;
		/// If TRUE, attributes will not be added to the serialized tags.
		BOOL		m_skip_attributes : 1;
		/// If TRUE, will serialize the root argument and not only its children. Not applicable for serialization of selections.
		BOOL		m_include_this : 1;
	};

	/**
	 * Serializes a subtree to a string in a buffer.
	 * @param[in] root The root element of the subtree to serialize.
	 * @param[out] buffer The buffer that will be filled with the serialized
	 *   data. Must be non-NULL.
	 * @param[in] options @see SerializeTreeOptions.
	 */
	static void	SerializeTreeL(HTML5NODE *root, TempBuffer* buffer, const SerializeTreeOptions &options);

	/**
	 * Non-leaving version of SerializeTreeL()
	 * @see HTML5Parser::SerializeTreeL()
	 */
	static OP_STATUS	SerializeTree(HTML5NODE *root, TempBuffer* buffer, const SerializeTreeOptions &options)
	{
		TRAPD(oom_status, SerializeTreeL(root, buffer, options));
		return oom_status;
	}

#ifndef HTML5_STANDALONE
	/**
	 * Serializes a text selection to a string in a buffer. Used for rich text
	 * copying of a part of a tree.
	 * @param[in] selection The text selection to serialize.
	 * @param[out] buffer The buffer that will be filled with the serialized
	 *   data. Must be non-NULL.
	 * @param[in] options @see SerializeTreeOptions.
	 */
	static void	SerializeSelectionL(const TextSelection &selection, TempBuffer* buffer, const SerializeTreeOptions &options);

	/**
	 * Non-leaving version of SerializeSelectionL()
	 * @see HTML5Parser::SerializeSelectionL()
	 */
	static OP_STATUS	SerializeSelection(const TextSelection &selection, TempBuffer* buffer, const SerializeTreeOptions &options)
	{
		TRAPD(oom_status, SerializeSelectionL(selection, buffer, options));
		return oom_status;
	}
#endif // !HTML5_STANDALONE

	/**
	 * Returns TRUE if the character is considered a space by HTML 5.
	 * @param c The character to check.
	 */
	static inline BOOL IsHTML5WhiteSpace(uni_char c)
	{
		// The space characters, for the purposes of this specification,
		// are U+0020 SPACE, U+0009 CHARACTER TABULATION (tab),
		// U+000A LINE FEED (LF), U+000C FORM FEED (FF), and
		// U+000D CARRIAGE RETURN (CR).

		/* 0x0d is normalized to 0x0a in the spec but not in Opera so we have to handle it as well */
		return c == 0x20 || c == 0x09 || c == 0x0a || c == 0x0c || c == 0x0d;
	}

	/**
	 * Used to signal to the parser that the stream passed to the parser is to
	 * be treated as plain text. Must be called before any content has been added
	 * to the logical tree and the parser has parsed any content.
	 */
	void		SetIsPlainText(BOOL is_plaintext = TRUE) { m_is_plaintext = is_plaintext; }
	/** Returns the value last set with SetIsPlainText() (FALSE initially). */
	BOOL		IsPlaintext() const { return m_is_plaintext; }

	/** Returns the associated logical document. Never NULL */
	LogicalDocument*	GetLogicalDocument() const { return m_logdoc; }
	/** Returns the current tokenizer. Can be NULL if not currently parsing. */
	HTML5Tokenizer*		GetTokenizer() const { return m_tokenizer; }
	/** Returns the associated tree builder. Never NULL */
	HTML5TreeBuilder*	GetTreeBuilder() { return &m_builder; }

	/**
	 * Called to pause the parser when the script nesting level becomes
	 * too deep, and un-pause it again when the level is back to 0.
	 * @param pause Will pause if TRUE, un-pause otherwise.
	 */
	void	SetIsPausing(BOOL pause) { m_is_pausing = pause; }
	/** Returns whether the parser is in a pause state or not */
	BOOL	IsPausing() const { return m_is_pausing; }

	/**
	 * Called to block the parser when a parser-blocking script has
	 * been added to the script queue, or unblock it again when there
	 * are no such scripts left.
	 * @param block Will block the parser if TRUE, and unblock if FALSE.
	 */
	void	SetIsBlocking(BOOL block) { m_is_blocking = block; }
	/** Returns whether the parser is blocked by script or not */
	BOOL	IsBlocking() const { return m_is_blocking; }

	/** Returns TRUE if the parser is blocked or paused by script */
	BOOL	IsBlocked() const { return m_is_blocking || m_is_pausing; }

	/**
	 * Signal that we cannot expect more data for this document.
	 * @param stopping Pass TRUE if we are stopping parsing.
	 */
	void	SignalNoMoreDataL(BOOL stopping);

	/** Returns TRUE if the parser has started parsing the data stream. */
	BOOL	IsParsingStarted() const { return m_is_parsing; }
	/** Called when the parser has started parsing the data stream. */
	void	SetParsingStarted() { m_is_parsing = TRUE; }
	/** Returns TRUE if the parser has not started parsing or has parsed
	 * past the last buffer in the data stream.
	 */
	BOOL	IsFinished() { return m_builder.IsFinished(); }
	/**
	 * Used to check if the parser is still parsing and the given element is still
	 * not closed so there might be more content inserted as a descendant of the element
	 * during normal parsing.
	 * @param elm The element to check for being closed or still open for more content.
	 * @returns TRUE if the parser can still insert more content under the element.
	 */
	BOOL	IsParsingUnderElm(HTML5ELEMENT *elm) { return m_builder.IsParsingUnderElm(elm); }
	/** Returns TRUE if the parser is waiting for a blocking script to finish */
	BOOL	IsWaitingForScript() { return m_builder.IsWaitingForScript(); }

	/** Returns TRUE if the parser is set up to report parsing errors */
	BOOL	ReportErrors() const { return m_report_errors; }

	/**
	 * Called when a parse error is encountered.
	 * @param[in] code The code for the error type.
	 * @param[in] line The line number of the stream being parsed.
	 * @param[in] pos The position on the line of the error.
	 */
	void		SignalErrorL(ErrorCode code, unsigned line, unsigned pos);

	/** Returns the number of reported errors that have not been flushed to
	 * the error console yet.
	 */
	unsigned	GetNumberOfErrors() const { return m_err_idx; }
	/**
	 * Returns the error code and source position of error number i.
	 * @param[in] i 0-based index of error, must be less than the number returned
	 * by GetNumberOfErrors().
	 * @param[out] line Will contain the 1-based line number of the error, or 0
	 *  if the line is unknown.
	 * @param[out] pos Will contain the 1-based position on the line of the error,
	 *  or 0 if the position is unknown.
	 * @returns The error code describing the error.
	 */
	ErrorCode	GetError(unsigned i, unsigned &line, unsigned &pos) const;

	/**
	 * Returns the canonical error message for the given error code.
	 * @param code The internal error code for the error.
	 * @returns A string containing a human readable error message.
	 */
	static const char*
				GetErrorString(ErrorCode code);

	/**
	 * Returns the line number for the first line parsed.
	 */
	unsigned GetFirstLineNumber() const { return m_first_line_number; }

	/**
	 * Sets the line number for the first line parsed.
	 */
	void SetFirstLineNumber(unsigned value) { m_first_line_number = value; }

#ifdef HTML5_STANDALONE
	/** Returns the flag last set with SetOutputTokenizerResults() (FALSE initially). */
	BOOL		GetOutputTokenizerResults() const { return m_output_tokenizer_results; }
	/** Sets the flag returned by GetOutputTokenizerResults(). */
	void		SetOutputTokenizerResults(BOOL output) { m_output_tokenizer_results = output; }
#endif // HTML5_STANDALONE

private:

	/// One queued parse error: its code and where in the source it occurred.
	/// Default constructed as a GENERIC error at line 0, position 0.
	class ErrorElm
	{
	public:
		ErrorElm() : m_code(GENERIC), m_line(0), m_pos(0) {}

		ErrorCode	m_code;
		unsigned	m_line;
		unsigned	m_pos;
	};

	/// The logical document passed to the constructor.
	LogicalDocument*	m_logdoc;
	/// The current tokenizer, NULL when there is none.
	HTML5Tokenizer*		m_tokenizer;
	/// The tree builder, embedded by value.
	HTML5TreeBuilder	m_builder;

	/// Capacity of the m_errors queue.
	static const unsigned	kErrorArraySize = 256;

	/// Line number for the first line parsed, 1 unless changed with SetFirstLineNumber().
	unsigned	m_first_line_number;
	/// Number of errors queued and not yet flushed, see GetNumberOfErrors().
	unsigned	m_err_idx;
	/// Queue of reported errors.
	ErrorElm	m_errors[kErrorArraySize];

	BOOL		m_is_parsing;		///< See IsParsingStarted().
	BOOL		m_is_pausing;		///< See SetIsPausing().
	BOOL		m_is_blocking;		///< See SetIsBlocking().
	BOOL		m_is_plaintext;		///< See SetIsPlainText().
	BOOL		m_report_errors;	///< See ReportErrors().
#ifdef HTML5_STANDALONE
	HTML5Token*	m_token;
	BOOL		m_output_tokenizer_results;
#endif // HTML5_STANDALONE

	/// Flush errors in the queue to the error console.
	void		FlushErrorsToConsoleL();

	/// Creates a new tokenizer. Mostly used when adding data with doc write.
	void		CreateTokenizerL();

	/// Called when finished parsing to clear finished tokenizers.
	void		ClearFinishedTokenizer();
};

#endif // HTML5PARSER_H
