Source/web/WebPageSerializerImpl.cpp - chromium/blink - Git at Google

 /*
  * Copyright (C) 2009 Google Inc. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
  * met:
  *
  *     * Redistributions of source code must retain the above copyright
  * notice, this list of conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above
  * copyright notice, this list of conditions and the following disclaimer
  * in the documentation and/or other materials provided with the
  * distribution.
  *     * Neither the name of Google Inc. nor the names of its
  * contributors may be used to endorse or promote products derived from
  * this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */

 // How we handle the base tag better.
 // Current status:
 // At present, the normal way we handle the base tag is:
 // a) For those links which have corresponding local saved files, such as
 // savable CSS, JavaScript files, they will be written to relative URLs which
 // point to local saved file. Why those links can not be resolved as absolute
 // file URLs, because if they are resolved as absolute URLs, after moving the
 // file location from one directory to another directory, the file URLs will
 // be dead links.
 // b) For those links which have not corresponding local saved files, such as
 // links in A, AREA tags, they will be resolved as absolute URLs.
 // c) We comment out all base tags when serializing the DOM for the page.
 // FireFox also uses above way to handle base tag.
 //
 // Problem:
 // This way can not handle the following situation:
 // the base tag is written by JavaScript.
 // For example. The page "www.yahoo.com" use
 // "document.write('<base href="http://www.yahoo.com/"...');" to setup base URL
 // of page when loading page. So when saving page as completed-HTML, we assume
 // that we save "www.yahoo.com" to "c:\yahoo.htm". After then we load the saved
 // completed-HTML page, then the JavaScript will insert a base tag
 // <base href="http://www.yahoo.com/"...> to DOM, so all URLs which point to
 // local saved resource files will be resolved as
 // "http://www.yahoo.com/yahoo_files/...", which will cause all saved  resource
 // files can not be loaded correctly. Also the page will be rendered ugly since
 // all saved sub-resource files (such as CSS, JavaScript files) and sub-frame
 // files can not be fetched.
 // Now FireFox, IE and WebKit based Browser all have this problem.
 //
 // Solution:
 // My solution is that we comment old base tag and write new base tag:
 // <base href="." ...> after the previous commented base tag. In WebKit, it
 // always uses the latest "href" attribute of base tag to set document's base
 // URL. Based on this behavior, when we encounter a base tag, we comment it and
 // write a new base tag <base href="."> after the previous commented base tag.
 // The new added base tag can help engine to locate correct base URL for
 // correctly loading local saved resource files. Also I think we need to inherit
 // the base target value from document object when appending new base tag.
 // If there are multiple base tags in original document, we will comment all old
 // base tags and append new base tag after each old base tag because we do not
 // know those old base tags are original content or added by JavaScript. If
 // they are added by JavaScript, it means when loading saved page, the script(s)
 // will still insert base tag(s) to DOM, so the new added base tag(s) can
 // override the incorrect base URL and make sure we always load correct local
 // saved resource files.

 #include "config.h"
 #include "web/WebPageSerializerImpl.h"

 #include "core/HTMLNames.h"
 #include "core/dom/Document.h"
 #include "core/dom/DocumentType.h"
 #include "core/dom/Element.h"
 #include "core/editing/serializers/Serialization.h"
 #include "core/html/HTMLAllCollection.h"
 #include "core/html/HTMLElement.h"
 #include "core/html/HTMLFormElement.h"
 #include "core/html/HTMLHtmlElement.h"
 #include "core/html/HTMLMetaElement.h"
 #include "core/loader/DocumentLoader.h"
 #include "core/loader/FrameLoader.h"
 #include "public/platform/WebVector.h"
 #include "web/WebLocalFrameImpl.h"
 #include "wtf/text/TextEncoding.h"

 namespace blink {

 // Maximum length of the data buffer that temporarily holds generated HTML
 // content before it is flushed to the client. This is a soft limit: the
 // buffer may exceed it when a very large contiguous string is found in the
 // page.
 static const unsigned dataBufferCapacity = 65536;

 // Builds the per-frame state object that is threaded through all of the
 // serialization helpers.
 //
 // |url|, |textEncoding|, |document| and |directoryName| are stored as given.
 // |document| is dereferenced here to cache whether it is an HTML document, so
 // it must not be null. Every progress flag starts cleared and no META element
 // is marked for skipping.
 WebPageSerializerImpl::SerializeDomParam::SerializeDomParam(
     const KURL& url,
     const WTF::TextEncoding& textEncoding,
     Document* document,
     const String& directoryName)
     // Caller-supplied context.
     : url(url)
     , textEncoding(textEncoding)
     , document(document)
     , directoryName(directoryName)
     // Cached so the helpers do not have to ask the document repeatedly.
     , isHTMLDocument(document->isHTMLDocument())
     // Progress flags and bookkeeping, all initially "nothing done yet".
     , haveSeenDocType(false)
     , haveAddedCharsetDeclaration(false)
     , skipMetaElement(nullptr)
     , isInScriptOrStyleTag(false)
     , haveAddedXMLProcessingDirective(false)
     , haveAddedContentsBeforeEnd(false)
 {
 }

 // Called before the open tag of |element| is written. Returns markup that
 // must precede the open tag and reports through |needSkip| whether the open
 // tag should be dropped entirely.
 //
 // XML documents: emits the XML declaration (once) and the doctype (once).
 // HTML documents:
 //   - META http-equiv="content-type" with a charset: marked to be skipped.
 //   - HTML element: doctype (once) followed by the Mark-of-the-Web comment.
 //   - BASE element: opens a comment so the original tag is commented out.
 String WebPageSerializerImpl::preActionBeforeSerializeOpenTag(
     const Element* element, SerializeDomParam* param, bool* needSkip)
 {
     StringBuilder prefix;
     *needSkip = false;

     if (!param->isHTMLDocument) {
         // The XML declaration is written a single time per document.
         if (!param->haveAddedXMLProcessingDirective) {
             param->haveAddedXMLProcessingDirective = true;
             // Pick the encoding name: declared XML encoding first, then the
             // document's encoding, finally UTF-8.
             String encodingName = param->document->xmlEncoding();
             if (encodingName.isEmpty())
                 encodingName = param->document->encodingName();
             if (encodingName.isEmpty())
                 encodingName = UTF8Encoding().name();

             prefix.appendLiteral("<?xml version=\"");
             prefix.append(param->document->xmlVersion());
             prefix.appendLiteral("\" encoding=\"");
             prefix.append(encodingName);
             if (param->document->xmlStandalone())
                 prefix.appendLiteral("\" standalone=\"yes");
             prefix.appendLiteral("\"?>\n");
         }
         // The doctype is likewise written a single time.
         if (!param->haveSeenDocType) {
             param->haveSeenDocType = true;
             prefix.append(createMarkup(param->document->doctype()));
         }
         return prefix.toString();
     }

     ASSERT(element);
     if (isHTMLMetaElement(*element)) {
         // A replacement charset META is generated right after the HEAD open
         // tag, so an original META that declares a charset is skipped.
         const HTMLMetaElement& metaElement = toHTMLMetaElement(*element);
         String httpEquiv = metaElement.httpEquiv();
         if (equalIgnoringCase(httpEquiv, "content-type")) {
             String metaContent = metaElement.content();
             const bool declaresCharset = metaContent.length()
                 && metaContent.contains("charset", TextCaseInsensitive);
             if (declaresCharset) {
                 param->skipMetaElement = element;
                 *needSkip = true;
             }
         }
         return prefix.toString();
     }

     if (isHTMLHtmlElement(*element)) {
         // Emit the doctype first if it has not been written yet.
         if (!param->haveSeenDocType) {
             param->haveSeenDocType = true;
             prefix.append(createMarkup(param->document->doctype()));
         }
         // Then the MOTW declaration, directly before the html tag.
         // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx.
         prefix.append(WebPageSerializer::generateMarkOfTheWebDeclaration(param->url));
         return prefix.toString();
     }

     if (isHTMLBaseElement(*element)) {
         // Start commenting out the BASE tag; the comment is closed after the
         // element's end tag has been serialized.
         prefix.appendLiteral("<!--");
     }
     return prefix.toString();
 }

 // Runs after the attributes of |element|'s open tag have been serialized and
 // returns extra markup to insert directly after the open tag.
 //
 // |element| must be non-null. |param| is updated as follows:
 //   - haveAddedContentsBeforeEnd is reset to false and set to true only when
 //     markup is returned, so the caller knows the open tag must be closed.
 //   - haveAddedCharsetDeclaration is set when the charset META declaration is
 //     generated for a HEAD element (done at most once).
 //   - isInScriptOrStyleTag is set when entering a SCRIPT or STYLE element.
 // For non-HTML documents an empty string is returned.
 String WebPageSerializerImpl::postActionAfterSerializeOpenTag(
     const Element* element, SerializeDomParam* param)
 {
     StringBuilder result;

     param->haveAddedContentsBeforeEnd = false;
     if (!param->isHTMLDocument)
         return result.toString();
     // Check after processing the open tag of HEAD element
     if (!param->haveAddedCharsetDeclaration
         && isHTMLHeadElement(*element)) {
         param->haveAddedCharsetDeclaration = true;
         // Check meta element. WebKit only pre-parse the first 512 bytes
         // of the document. If the whole <HEAD> is larger and meta is the
         // end of head part, then this kind of pages aren't decoded correctly
         // because of this issue. So when we serialize the DOM, we need to
         // make sure the meta will in first child of head tag.
         // See http://bugs.webkit.org/show_bug.cgi?id=16621.
         // First we generate new content for writing correct META element.
         result.append(WebPageSerializer::generateMetaCharsetDeclaration(
             String(param->textEncoding.name())));

         param->haveAddedContentsBeforeEnd = true;
         // Will search each META which has charset declaration, and skip them all
         // in PreActionBeforeSerializeOpenTag.
     } else if (isHTMLScriptElement(*element) || isHTMLStyleElement(*element)) {
         // Track both SCRIPT and STYLE, as the flag name promises. Testing
         // SCRIPT twice would leave STYLE elements untracked.
         param->isInScriptOrStyleTag = true;
     }

     return result.toString();
 }

 // Runs before the end tag of |element| is serialized. Always returns an empty
 // string; its effect is on |needSkip| and |param|:
 //   - |needSkip| is set to true when |element| is the META element recorded
 //     in param->skipMetaElement, so that its end tag is dropped too.
 //   - isInScriptOrStyleTag is cleared when leaving a SCRIPT or STYLE element.
 // For non-HTML documents nothing is skipped and no state is changed.
 String WebPageSerializerImpl::preActionBeforeSerializeEndTag(
     const Element* element, SerializeDomParam* param, bool* needSkip)
 {
     String result;

     *needSkip = false;
     if (!param->isHTMLDocument)
         return result;
     // Skip the end tag of original META tag which declare charset.
     // Need not to check whether it's META tag since we guarantee
     // skipMetaElement is definitely META tag if it's not 0.
     if (param->skipMetaElement == element) {
         *needSkip = true;
     } else if (isHTMLScriptElement(*element) || isHTMLStyleElement(*element)) {
         // Must mirror the SCRIPT/STYLE check made after the open tag so the
         // flag is cleared for exactly the elements that set it.
         ASSERT(param->isInScriptOrStyleTag);
         param->isInScriptOrStyleTag = false;
     }

     return result;
 }

 // Hook invoked once the end tag of |element| has been serialized; returns any
 // markup that has to follow the end tag.
 //
 // Only HTML BASE elements produce output: the comment opened before the BASE
 // open tag is closed, and a freshly generated base tag declaration carrying
 // the document's base target is appended after it.
 String WebPageSerializerImpl::postActionAfterSerializeEndTag(
     const Element* element, SerializeDomParam* param)
 {
     StringBuilder suffix;

     if (param->isHTMLDocument && isHTMLBaseElement(*element)) {
         // Finish commenting out the original BASE tag.
         suffix.appendLiteral("-->");
         // Then write the replacement base tag declaration.
         const String replacementBase =
             WebPageSerializer::generateBaseTagDeclaration(param->document->baseTarget());
         suffix.append(replacementBase);
     }

     return suffix.toString();
 }

 void WebPageSerializerImpl::saveHTMLContentToBuffer(
     const String& result, SerializeDomParam* param)
 {
     m_dataBuffer.append(result);
     encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsNotFinished,
                          param,
                          DoNotForceFlush);
 }

 // Encodes the buffered markup with the frame's text encoding and hands it to
 // the client together with |status|.
 //
 // Nothing happens unless |flushOption| is ForceFlush or the buffer has grown
 // past dataBufferCapacity. The buffer is emptied before the client is called.
 void WebPageSerializerImpl::encodeAndFlushBuffer(
     WebPageSerializerClient::PageSerializationStatus status,
     SerializeDomParam* param,
     FlushOption flushOption)
 {
     const bool shouldFlush = flushOption == ForceFlush
         || m_dataBuffer.length() > dataBufferCapacity;
     if (!shouldFlush)
         return;

     // Take the pending text out of the buffer.
     String pendingMarkup = m_dataBuffer.toString();
     m_dataBuffer.clear();

     // Characters the encoding cannot represent are written as entities.
     CString encodedMarkup =
         param->textEncoding.normalizeAndEncode(pendingMarkup, WTF::EntitiesForUnencodables);

     // Deliver the encoded bytes to the client.
     WebCString payload(encodedMarkup.data(), encodedMarkup.length());
     m_client->didSerializeDataForFrame(param->url, payload, status);
 }

 // Serializes the open tag of |element|, including all of its attributes, and
 // appends the markup to the data buffer.
 //
 // Steps:
 //   1. Run the pre action; if it asks to skip the tag, nothing is written.
 //   2. Write "<name" followed by every attribute as name="value".
 //      - Link attributes: "javascript:" values are kept as they are; other
 //        values are resolved to an absolute URL (or, for frame owner
 //        elements, the sub-frame document's URL) and replaced by the local
 //        path when m_localLinks has an entry for that URL.
 //      - All values are entity-escaped before being placed between the
 //        double quotes, so a '"' or '&' in a value cannot break the markup.
 //   3. Run the post action. The tag is closed with '>' only when the element
 //      has children or the post action added contents; otherwise
 //      endTagToString() completes the tag.
 void WebPageSerializerImpl::openTagToString(Element* element,
                                             SerializeDomParam* param)
 {
     bool needSkip = false;
     StringBuilder result;
     // Do pre action for open tag.
     result.append(preActionBeforeSerializeOpenTag(element, param, &needSkip));
     if (needSkip)
         return;

     // Entity table matching the kind of document being written.
     auto& entities = param->isHTMLDocument ? m_htmlEntities : m_xmlEntities;

     // Add open tag
     result.append('<');
     result.append(element->nodeName().lower());
     // Go through all attributes and serialize them.
     AttributeCollection attributes = element->attributes();
     for (const auto& attribute : attributes) {
         result.append(' ');
         // Add attribute pair
         result.append(attribute.name().toString());
         result.appendLiteral("=\"");
         if (!attribute.value().isEmpty()) {
             const String& attrValue = attribute.value();

             // Check whether we need to replace some resource links
             // with local resource paths.
             const QualifiedName& attrName = attribute.name();
             if (element->hasLegalLinkAttribute(attrName)) {
                 // For links start with "javascript:", we do not change the
                 // link itself, but it still has to be escaped to stay inside
                 // the quoted attribute value.
                 if (attrValue.startsWith("javascript:", TextCaseInsensitive)) {
                     result.append(entities.convertEntitiesInString(attrValue));
                 } else {
                     // Get the absolute link
                     WebLocalFrameImpl* subFrame = WebLocalFrameImpl::fromFrameOwnerElement(element);
                     String completeURL = subFrame ? subFrame->frame()->document()->url() :
                                                     param->document->completeURL(attrValue);
                     // Check whether we have local files for those link.
                     if (m_localLinks.contains(completeURL)) {
                         if (!param->directoryName.isEmpty()) {
                             result.appendLiteral("./");
                             result.append(param->directoryName);
                             result.append('/');
                         }
                         result.append(entities.convertEntitiesInString(m_localLinks.get(completeURL)));
                     } else {
                         result.append(entities.convertEntitiesInString(completeURL));
                     }
                 }
             } else {
                 result.append(entities.convertEntitiesInString(attrValue));
             }
         }
         result.append('\"');
     }

     // Do post action for open tag.
     String addedContents = postActionAfterSerializeOpenTag(element, param);
     // Complete the open tag for element when it has child/children.
     if (element->hasChildren() || param->haveAddedContentsBeforeEnd)
         result.append('>');
     // Append the added contents generate in  post action of open tag.
     result.append(addedContents);
     // Save the result to data buffer.
     saveHTMLContentToBuffer(result.toString(), param);
 }

 // Serialize end tag of an specified element.
 // Writes whatever is needed to finish |element| to the data buffer.
 //
 // If the open tag was already closed with '>' (the element has children, or
 // contents were added after the open tag), only "</name>" is written.
 // Otherwise the still-open tag is completed here: HTML documents get '>' and,
 // unless the element is an HTML element for which ieForbidsInsertHTML() is
 // true, a "</name>" end tag; XML documents get the self-closing " />".
 // The pre action may request that nothing is written at all.
 void WebPageSerializerImpl::endTagToString(Element* element,
                                            SerializeDomParam* param)
 {
     bool needSkip;
     StringBuilder markup;
     // Give the pre action a chance to veto the end tag.
     markup.append(preActionBeforeSerializeEndTag(element, param, &needSkip));
     if (needSkip)
         return;

     const bool openTagAlreadyClosed =
         element->hasChildren() || param->haveAddedContentsBeforeEnd;
     bool writeEndTag = true;
     if (!openTagAlreadyClosed) {
         if (param->isHTMLDocument) {
             markup.append('>');
             // FIXME: This code is horribly wrong.  WebPageSerializerImpl must die.
             writeEndTag = !element->isHTMLElement()
                 || !toHTMLElement(element)->ieForbidsInsertHTML();
         } else {
             // XML based document: empty elements are self-closing.
             markup.appendLiteral(" />");
             writeEndTag = false;
         }
     }

     if (writeEndTag) {
         markup.appendLiteral("</");
         markup.append(element->nodeName().lower());
         markup.append('>');
     }

     // Let the post action append trailing markup, then buffer everything.
     markup.append(postActionAfterSerializeEndTag(element, param));
     saveHTMLContentToBuffer(markup.toString(), param);
 }

 // Recursively serializes |node| and its subtree into the data buffer.
 //
 // Elements are written as open tag, children (in document order), end tag.
 // Attribute, document and document-fragment nodes are not expected here.
 // Every other node type is written using createMarkup(); encountering a
 // doctype node additionally records that the doctype has been seen.
 void WebPageSerializerImpl::buildContentForNode(Node* node,
                                                 SerializeDomParam* param)
 {
     const auto nodeType = node->nodeType();

     if (nodeType == Node::ELEMENT_NODE) {
         Element* element = toElement(node);
         openTagToString(element, param);
         Node* child = node->firstChild();
         while (child) {
             buildContentForNode(child, param);
             child = child->nextSibling();
         }
         endTagToString(element, param);
         return;
     }

     const bool isUnexpectedType = nodeType == Node::ATTRIBUTE_NODE
         || nodeType == Node::DOCUMENT_NODE
         || nodeType == Node::DOCUMENT_FRAGMENT_NODE;
     if (isUnexpectedType) {
         // Should not exist.
         ASSERT_NOT_REACHED();
         return;
     }

     // Document type node can be in DOM?
     if (nodeType == Node::DOCUMENT_TYPE_NODE)
         param->haveSeenDocType = true;

     // Text nodes and all remaining node types use the default markup.
     saveHTMLContentToBuffer(createMarkup(node), param);
 }

 // Sets up a serializer for |frame|.
 //
 // |recursiveSerialization| selects whether sub-frames are serialized too.
 // |client| receives the serialized data. |links| and |localPaths| are
 // parallel arrays (same length) mapping each resource URL to its local path;
 // they are stored in m_localLinks keyed by the URL string.
 // |localDirectoryName| is kept for building relative resource paths.
 // |frame| and |client| must be non-null.
 WebPageSerializerImpl::WebPageSerializerImpl(WebFrame* frame,
                                              bool recursiveSerialization,
                                              WebPageSerializerClient* client,
                                              const WebVector<WebURL>& links,
                                              const WebVector<WebString>& localPaths,
                                              const WebString& localDirectoryName)
     : m_client(client)
     , m_recursiveSerialization(recursiveSerialization)
     , m_framesCollected(false)
     , m_localDirectoryName(localDirectoryName)
     , m_htmlEntities(false)
     , m_xmlEntities(true)
 {
     // A usable frame is mandatory.
     ASSERT(frame);
     m_specifiedWebLocalFrameImpl = toWebLocalFrameImpl(frame);
     // So is a client to deliver the data to.
     ASSERT(client);

     // Fill the URL -> local path map from the two parallel arrays.
     ASSERT(links.size() == localPaths.size());
     const size_t linkCount = links.size();
     for (size_t index = 0; index < linkCount; ++index) {
         KURL linkURL = links[index];
         const String& linkKey = linkURL.string();
         ASSERT(!m_localLinks.contains(linkKey));
         m_localLinks.set(linkKey, localPaths[index]);
     }

     ASSERT(m_dataBuffer.isEmpty());
 }

 void WebPageSerializerImpl::collectTargetFrames()
 {
     ASSERT(!m_framesCollected);
     m_framesCollected = true;

     // First, process main frame.
     m_frames.append(m_specifiedWebLocalFrameImpl);
     // Return now if user only needs to serialize specified frame, not including
     // all sub-frames.
     if (!m_recursiveSerialization)
         return;
     // Collect all frames inside the specified frame.
     for (WebLocalFrameImpl* frame : m_frames) {
         // Get current using document.
         Document* currentDoc = frame->frame()->document();
         // Go through sub-frames.
         RefPtrWillBeRawPtr<HTMLAllCollection> all = currentDoc->all();

         for (unsigned i = 0; Element* element = all->item(i); ++i) {
             if (!element->isHTMLElement())
                 continue;
             WebLocalFrameImpl* webFrame =
                 WebLocalFrameImpl::fromFrameOwnerElement(element);
             if (webFrame)
                 m_frames.append(webFrame);
         }
     }
 }

 bool WebPageSerializerImpl::serialize()
 {
     if (!m_framesCollected)
         collectTargetFrames();

     bool didSerialization = false;
     KURL mainURL = m_specifiedWebLocalFrameImpl->frame()->document()->url();

     for (unsigned i = 0; i < m_frames.size(); ++i) {
         WebLocalFrameImpl* webFrame = m_frames[i];
         Document* document = webFrame->frame()->document();
         const KURL& url = document->url();

         if (!url.isValid() || !m_localLinks.contains(url.string()))
             continue;

         didSerialization = true;

         const WTF::TextEncoding& textEncoding = document->encoding().isValid() ? document->encoding() : UTF8Encoding();
         String directoryName = url == mainURL ? m_localDirectoryName : "";

         SerializeDomParam param(url, textEncoding, document, directoryName);

         Element* documentElement = document->documentElement();
         if (documentElement)
             buildContentForNode(documentElement, &param);

         encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsFinished, &param, ForceFlush);
     }

     ASSERT(m_dataBuffer.isEmpty());
     m_client->didSerializeDataForFrame(KURL(), WebCString("", 0), WebPageSerializerClient::AllFramesAreFinished);
     return didSerialization;
 }

 }  // namespace blink
	/*
	* Copyright (C) 2009 Google Inc. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are
	* met:
	*
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above
	* copyright notice, this list of conditions and the following disclaimer
	* in the documentation and/or other materials provided with the
	* distribution.
	* * Neither the name of Google Inc. nor the names of its
	* contributors may be used to endorse or promote products derived from
	* this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*/

	// How we handle the base tag better.
	// Current status:
	// At present, the normal way we handle the base tag is:
	// a) For those links which have corresponding local saved files, such as
	// savable CSS, JavaScript files, they will be written to relative URLs which
	// point to local saved file. Why those links can not be resolved as absolute
	// file URLs, because if they are resolved as absolute URLs, after moving the
	// file location from one directory to another directory, the file URLs will
	// be dead links.
	// b) For those links which have not corresponding local saved files, such as
	// links in A, AREA tags, they will be resolved as absolute URLs.
	// c) We comment out all base tags when serializing the DOM for the page.
	// FireFox also uses above way to handle base tag.
	//
	// Problem:
	// This way can not handle the following situation:
	// the base tag is written by JavaScript.
	// For example. The page "www.yahoo.com" use
	// "document.write('<base href="http://www.yahoo.com/"...');" to setup base URL
	// of page when loading page. So when saving page as completed-HTML, we assume
	// that we save "www.yahoo.com" to "c:\yahoo.htm". After then we load the saved
	// completed-HTML page, then the JavaScript will insert a base tag
	// <base href="http://www.yahoo.com/"...> to DOM, so all URLs which point to
	// local saved resource files will be resolved as
	// "http://www.yahoo.com/yahoo_files/...", which will cause all saved resource
	// files can not be loaded correctly. Also the page will be rendered ugly since
	// all saved sub-resource files (such as CSS, JavaScript files) and sub-frame
	// files can not be fetched.
	// Now FireFox, IE and WebKit based Browser all have this problem.
	//
	// Solution:
	// My solution is that we comment old base tag and write new base tag:
	// <base href="." ...> after the previous commented base tag. In WebKit, it
	// always uses the latest "href" attribute of base tag to set document's base
	// URL. Based on this behavior, when we encounter a base tag, we comment it and
	// write a new base tag <base href="."> after the previous commented base tag.
	// The new added base tag can help engine to locate correct base URL for
	// correctly loading local saved resource files. Also I think we need to inherit
	// the base target value from document object when appending new base tag.
	// If there are multiple base tags in original document, we will comment all old
	// base tags and append new base tag after each old base tag because we do not
	// know those old base tags are original content or added by JavaScript. If
	// they are added by JavaScript, it means when loading saved page, the script(s)
	// will still insert base tag(s) to DOM, so the new added base tag(s) can
	// override the incorrect base URL and make sure we always load correct local
	// saved resource files.

	#include "config.h"
	#include "web/WebPageSerializerImpl.h"

	#include "core/HTMLNames.h"
	#include "core/dom/Document.h"
	#include "core/dom/DocumentType.h"
	#include "core/dom/Element.h"
	#include "core/editing/serializers/Serialization.h"
	#include "core/html/HTMLAllCollection.h"
	#include "core/html/HTMLElement.h"
	#include "core/html/HTMLFormElement.h"
	#include "core/html/HTMLHtmlElement.h"
	#include "core/html/HTMLMetaElement.h"
	#include "core/loader/DocumentLoader.h"
	#include "core/loader/FrameLoader.h"
	#include "public/platform/WebVector.h"
	#include "web/WebLocalFrameImpl.h"
	#include "wtf/text/TextEncoding.h"

	namespace blink {

	// Maximum length of the data buffer that temporarily holds generated HTML
	// content before it is flushed to the client. This is a soft limit: the
	// buffer may exceed it when a very large contiguous string is found in the
	// page.
	static const unsigned dataBufferCapacity = 65536;

	// Builds the per-frame state object that is threaded through all of the
	// serialization helpers.
	//
	// |url|, |textEncoding|, |document| and |directoryName| are stored as given.
	// |document| is dereferenced here to cache whether it is an HTML document, so
	// it must not be null. Every progress flag starts cleared and no META element
	// is marked for skipping.
	WebPageSerializerImpl::SerializeDomParam::SerializeDomParam(
	    const KURL& url,
	    const WTF::TextEncoding& textEncoding,
	    Document* document,
	    const String& directoryName)
	    // Caller-supplied context.
	    : url(url)
	    , textEncoding(textEncoding)
	    , document(document)
	    , directoryName(directoryName)
	    // Cached so the helpers do not have to ask the document repeatedly.
	    , isHTMLDocument(document->isHTMLDocument())
	    // Progress flags and bookkeeping, all initially "nothing done yet".
	    , haveSeenDocType(false)
	    , haveAddedCharsetDeclaration(false)
	    , skipMetaElement(nullptr)
	    , isInScriptOrStyleTag(false)
	    , haveAddedXMLProcessingDirective(false)
	    , haveAddedContentsBeforeEnd(false)
	{
	}

	// Called before the open tag of |element| is written. Returns markup that
	// must precede the open tag and reports through |needSkip| whether the open
	// tag should be dropped entirely.
	//
	// XML documents: emits the XML declaration (once) and the doctype (once).
	// HTML documents:
	//   - META http-equiv="content-type" with a charset: marked to be skipped.
	//   - HTML element: doctype (once) followed by the Mark-of-the-Web comment.
	//   - BASE element: opens a comment so the original tag is commented out.
	String WebPageSerializerImpl::preActionBeforeSerializeOpenTag(
	    const Element* element, SerializeDomParam* param, bool* needSkip)
	{
	    StringBuilder prefix;
	    *needSkip = false;

	    if (!param->isHTMLDocument) {
	        // The XML declaration is written a single time per document.
	        if (!param->haveAddedXMLProcessingDirective) {
	            param->haveAddedXMLProcessingDirective = true;
	            // Pick the encoding name: declared XML encoding first, then the
	            // document's encoding, finally UTF-8.
	            String encodingName = param->document->xmlEncoding();
	            if (encodingName.isEmpty())
	                encodingName = param->document->encodingName();
	            if (encodingName.isEmpty())
	                encodingName = UTF8Encoding().name();

	            prefix.appendLiteral("<?xml version=\"");
	            prefix.append(param->document->xmlVersion());
	            prefix.appendLiteral("\" encoding=\"");
	            prefix.append(encodingName);
	            if (param->document->xmlStandalone())
	                prefix.appendLiteral("\" standalone=\"yes");
	            prefix.appendLiteral("\"?>\n");
	        }
	        // The doctype is likewise written a single time.
	        if (!param->haveSeenDocType) {
	            param->haveSeenDocType = true;
	            prefix.append(createMarkup(param->document->doctype()));
	        }
	        return prefix.toString();
	    }

	    ASSERT(element);
	    if (isHTMLMetaElement(*element)) {
	        // A replacement charset META is generated right after the HEAD open
	        // tag, so an original META that declares a charset is skipped.
	        const HTMLMetaElement& metaElement = toHTMLMetaElement(*element);
	        String httpEquiv = metaElement.httpEquiv();
	        if (equalIgnoringCase(httpEquiv, "content-type")) {
	            String metaContent = metaElement.content();
	            const bool declaresCharset = metaContent.length()
	                && metaContent.contains("charset", TextCaseInsensitive);
	            if (declaresCharset) {
	                param->skipMetaElement = element;
	                *needSkip = true;
	            }
	        }
	        return prefix.toString();
	    }

	    if (isHTMLHtmlElement(*element)) {
	        // Emit the doctype first if it has not been written yet.
	        if (!param->haveSeenDocType) {
	            param->haveSeenDocType = true;
	            prefix.append(createMarkup(param->document->doctype()));
	        }
	        // Then the MOTW declaration, directly before the html tag.
	        // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx.
	        prefix.append(WebPageSerializer::generateMarkOfTheWebDeclaration(param->url));
	        return prefix.toString();
	    }

	    if (isHTMLBaseElement(*element)) {
	        // Start commenting out the BASE tag; the comment is closed after the
	        // element's end tag has been serialized.
	        prefix.appendLiteral("<!--");
	    }
	    return prefix.toString();
	}

	// Runs after the attributes of |element|'s open tag have been serialized and
	// returns extra markup to insert directly after the open tag.
	//
	// |element| must be non-null. |param| is updated as follows:
	//   - haveAddedContentsBeforeEnd is reset to false and set to true only when
	//     markup is returned, so the caller knows the open tag must be closed.
	//   - haveAddedCharsetDeclaration is set when the charset META declaration is
	//     generated for a HEAD element (done at most once).
	//   - isInScriptOrStyleTag is set when entering a SCRIPT or STYLE element.
	// For non-HTML documents an empty string is returned.
	String WebPageSerializerImpl::postActionAfterSerializeOpenTag(
	    const Element* element, SerializeDomParam* param)
	{
	    StringBuilder result;

	    param->haveAddedContentsBeforeEnd = false;
	    if (!param->isHTMLDocument)
	        return result.toString();
	    // Check after processing the open tag of HEAD element
	    if (!param->haveAddedCharsetDeclaration
	        && isHTMLHeadElement(*element)) {
	        param->haveAddedCharsetDeclaration = true;
	        // Check meta element. WebKit only pre-parse the first 512 bytes
	        // of the document. If the whole <HEAD> is larger and meta is the
	        // end of head part, then this kind of pages aren't decoded correctly
	        // because of this issue. So when we serialize the DOM, we need to
	        // make sure the meta will in first child of head tag.
	        // See http://bugs.webkit.org/show_bug.cgi?id=16621.
	        // First we generate new content for writing correct META element.
	        result.append(WebPageSerializer::generateMetaCharsetDeclaration(
	            String(param->textEncoding.name())));

	        param->haveAddedContentsBeforeEnd = true;
	        // Will search each META which has charset declaration, and skip them all
	        // in PreActionBeforeSerializeOpenTag.
	    } else if (isHTMLScriptElement(*element) || isHTMLStyleElement(*element)) {
	        // Track both SCRIPT and STYLE, as the flag name promises. Testing
	        // SCRIPT twice would leave STYLE elements untracked.
	        param->isInScriptOrStyleTag = true;
	    }

	    return result.toString();
	}

	// Runs before the end tag of |element| is serialized. Always returns an empty
	// string; its effect is on |needSkip| and |param|:
	//   - |needSkip| is set to true when |element| is the META element recorded
	//     in param->skipMetaElement, so that its end tag is dropped too.
	//   - isInScriptOrStyleTag is cleared when leaving a SCRIPT or STYLE element.
	// For non-HTML documents nothing is skipped and no state is changed.
	String WebPageSerializerImpl::preActionBeforeSerializeEndTag(
	    const Element* element, SerializeDomParam* param, bool* needSkip)
	{
	    String result;

	    *needSkip = false;
	    if (!param->isHTMLDocument)
	        return result;
	    // Skip the end tag of original META tag which declare charset.
	    // Need not to check whether it's META tag since we guarantee
	    // skipMetaElement is definitely META tag if it's not 0.
	    if (param->skipMetaElement == element) {
	        *needSkip = true;
	    } else if (isHTMLScriptElement(*element) || isHTMLStyleElement(*element)) {
	        // Must mirror the SCRIPT/STYLE check made after the open tag so the
	        // flag is cleared for exactly the elements that set it.
	        ASSERT(param->isInScriptOrStyleTag);
	        param->isInScriptOrStyleTag = false;
	    }

	    return result;
	}

	// Hook invoked once the end tag of |element| has been written. Returns any
	// additional markup that has to follow that end tag. Only BASE elements of
	// HTML documents produce output: the comment wrapping the original BASE tag
	// is closed and a freshly generated BASE declaration is appended.
	String WebPageSerializerImpl::postActionAfterSerializeEndTag(
	    const Element* element, SerializeDomParam* param)
	{
	    StringBuilder trailingMarkup;

	    if (param->isHTMLDocument && isHTMLBaseElement(*element)) {
	        // Terminate the comment that hides the original BASE tag.
	        trailingMarkup.appendLiteral("-->");
	        // Emit the replacement BASE tag built from the document's base target.
	        trailingMarkup.append(WebPageSerializer::generateBaseTagDeclaration(
	            param->document->baseTarget()));
	    }

	    return trailingMarkup.toString();
	}

	void WebPageSerializerImpl::saveHTMLContentToBuffer(
	const String& result, SerializeDomParam* param)
	{
	m_dataBuffer.append(result);
	encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsNotFinished,
	param,
	DoNotForceFlush);
	}

	// Encodes the pending buffer with param->textEncoding and hands it to the
	// client together with |status|. Nothing happens unless a flush is forced
	// or the buffer has grown past dataBufferCapacity.
	void WebPageSerializerImpl::encodeAndFlushBuffer(
	    WebPageSerializerClient::PageSerializationStatus status,
	    SerializeDomParam* param,
	    FlushOption flushOption)
	{
	    const bool shouldFlush = flushOption == ForceFlush
	        || m_dataBuffer.length() > dataBufferCapacity;
	    if (!shouldFlush)
	        return;

	    // Take the accumulated text and leave the buffer empty for the next round.
	    String pendingText = m_dataBuffer.toString();
	    m_dataBuffer.clear();

	    CString encodedBytes = param->textEncoding.normalizeAndEncode(pendingText, WTF::EntitiesForUnencodables);
	    WebCString payload(encodedBytes.data(), encodedBytes.length());

	    // Deliver the encoded chunk to the client.
	    m_client->didSerializeDataForFrame(param->url, payload, status);
	}

	// Serializes the open tag of |element|, including all of its attributes,
	// and appends the markup to the data buffer.
	//
	// The pre-action hook may ask for the element to be skipped, in which case
	// nothing is written. Attributes reported by hasLegalLinkAttribute() are
	// rewritten: "javascript:" values are kept verbatim; other values are
	// resolved to an absolute URL and, when that URL is present in
	// m_localLinks, replaced by the local path (prefixed with
	// "./<directoryName>/" when a directory name is set). All other attribute
	// values go through the HTML or XML entity converter. The closing '>' is
	// only written when the element has children or the post-action hook
	// injected content; otherwise endTagToString() completes the tag.
	void WebPageSerializerImpl::openTagToString(Element* element,
	    SerializeDomParam* param)
	{
	    bool needSkip = false;
	    StringBuilder result;
	    // Do pre action for open tag.
	    result.append(preActionBeforeSerializeOpenTag(element, param, &needSkip));
	    if (needSkip)
	        return;
	    // Add open tag
	    result.append('<');
	    result.append(element->nodeName().lower());
	    // Go through all attributes and serialize them.
	    AttributeCollection attributes = element->attributes();
	    AttributeCollection::iterator end = attributes.end();
	    for (AttributeCollection::iterator it = attributes.begin(); it != end; ++it) {
	        result.append(' ');
	        // Add attribute pair
	        result.append(it->name().toString());
	        result.appendLiteral("=\"");
	        if (!it->value().isEmpty()) {
	            const String& attrValue = it->value();

	            // Check whether we need to replace some resource links
	            // with local resource paths.
	            const QualifiedName& attrName = it->name();
	            if (element->hasLegalLinkAttribute(attrName)) {
	                // For links start with "javascript:", we do not change it.
	                if (attrValue.startsWith("javascript:", TextCaseInsensitive)) {
	                    result.append(attrValue);
	                } else {
	                    // Get the absolute link. For a frame owner element the
	                    // URL of the sub-frame's document is used instead of
	                    // the attribute value.
	                    WebLocalFrameImpl* subFrame = WebLocalFrameImpl::fromFrameOwnerElement(element);
	                    String completeURL = subFrame ? subFrame->frame()->document()->url() :
	                        param->document->completeURL(attrValue);
	                    // Check whether we have local files for those link.
	                    if (m_localLinks.contains(completeURL)) {
	                        if (!param->directoryName.isEmpty()) {
	                            result.appendLiteral("./");
	                            result.append(param->directoryName);
	                            result.append('/');
	                        }
	                        result.append(m_localLinks.get(completeURL));
	                    } else {
	                        result.append(completeURL);
	                    }
	                }
	            } else {
	                if (param->isHTMLDocument)
	                    result.append(m_htmlEntities.convertEntitiesInString(attrValue));
	                else
	                    result.append(m_xmlEntities.convertEntitiesInString(attrValue));
	            }
	        }
	        result.append('\"');
	    }

	    // Do post action for open tag.
	    String addedContents = postActionAfterSerializeOpenTag(element, param);
	    // Complete the open tag for element when it has child/children.
	    if (element->hasChildren() || param->haveAddedContentsBeforeEnd)
	        result.append('>');
	    // Append the added contents generate in post action of open tag.
	    result.append(addedContents);
	    // Save the result to data buffer.
	    saveHTMLContentToBuffer(result.toString(), param);
	}

	// Serializes the end tag of |element| and appends it to the data buffer.
	//
	// The pre-action hook may ask for the end tag to be skipped, in which case
	// nothing is written. Three shapes are produced:
	//  - element with children or injected contents: "</name>";
	//  - empty element in an HTML document: ">" to finish the still-open start
	//    tag, followed by "</name>" unless the element is an HTML element for
	//    which ieForbidsInsertHTML() is true;
	//  - empty element in a non-HTML document: " />".
	// Whatever the post-action hook returns is appended afterwards.
	void WebPageSerializerImpl::endTagToString(Element* element,
	    SerializeDomParam* param)
	{
	    bool needSkip = false;
	    StringBuilder result;
	    // Do pre action for end tag.
	    result.append(preActionBeforeSerializeEndTag(element, param, &needSkip));
	    if (needSkip)
	        return;
	    // Write end tag when element has child/children.
	    if (element->hasChildren() || param->haveAddedContentsBeforeEnd) {
	        result.appendLiteral("</");
	        result.append(element->nodeName().lower());
	        result.append('>');
	    } else {
	        // Check whether we have to write end tag for empty element.
	        if (param->isHTMLDocument) {
	            // The open tag was left without its '>' by openTagToString().
	            result.append('>');
	            // FIXME: This code is horribly wrong. WebPageSerializerImpl must die.
	            if (!element->isHTMLElement() || !toHTMLElement(element)->ieForbidsInsertHTML()) {
	                // We need to write end tag when it is required.
	                result.appendLiteral("</");
	                result.append(element->nodeName().lower());
	                result.append('>');
	            }
	        } else {
	            // For xml base document.
	            result.appendLiteral(" />");
	        }
	    }
	    // Do post action for end tag.
	    result.append(postActionAfterSerializeEndTag(element, param));
	    // Save the result to data buffer.
	    saveHTMLContentToBuffer(result.toString(), param);
	}

	// Recursively serializes |node| into the data buffer. Elements are written
	// as open tag, children, end tag; every other supported node type is
	// written through createMarkup(). Attribute, document and document-fragment
	// nodes are not expected here.
	void WebPageSerializerImpl::buildContentForNode(Node* node,
	    SerializeDomParam* param)
	{
	    switch (node->nodeType()) {
	    case Node::ATTRIBUTE_NODE:
	    case Node::DOCUMENT_NODE:
	    case Node::DOCUMENT_FRAGMENT_NODE:
	        // These node types should never be handed to this function.
	        ASSERT_NOT_REACHED();
	        break;
	    case Node::ELEMENT_NODE: {
	        Element* element = toElement(node);
	        // Open tag first, then every child in document order, then end tag.
	        openTagToString(element, param);
	        Node* child = node->firstChild();
	        while (child) {
	            buildContentForNode(child, param);
	            child = child->nextSibling();
	        }
	        endTagToString(element, param);
	        break;
	    }
	    case Node::DOCUMENT_TYPE_NODE:
	        // Remember that a doctype was encountered, then serialize it like
	        // any other non-element node.
	        param->haveSeenDocType = true;
	        // Fall through.
	    case Node::TEXT_NODE:
	    default:
	        // Text and all remaining node types use the default markup.
	        saveHTMLContentToBuffer(createMarkup(node), param);
	        break;
	    }
	}

	// Builds a serializer for |frame|. |links| and |localPaths| are parallel
	// arrays: entry i of |links| is mapped to entry i of |localPaths| in
	// m_localLinks. |frame| and |client| must be non-null.
	WebPageSerializerImpl::WebPageSerializerImpl(WebFrame* frame,
	    bool recursiveSerialization,
	    WebPageSerializerClient* client,
	    const WebVector<WebURL>& links,
	    const WebVector<WebString>& localPaths,
	    const WebString& localDirectoryName)
	    : m_client(client)
	    , m_recursiveSerialization(recursiveSerialization)
	    , m_framesCollected(false)
	    , m_localDirectoryName(localDirectoryName)
	    , m_htmlEntities(false)
	    , m_xmlEntities(true)
	{
	    // A valid frame is mandatory.
	    ASSERT(frame);
	    m_specifiedWebLocalFrameImpl = toWebLocalFrameImpl(frame);
	    // So is a valid client.
	    ASSERT(client);

	    // Populate the link -> local path table; both arrays must line up.
	    ASSERT(links.size() == localPaths.size());
	    for (size_t index = 0; index < links.size(); ++index) {
	        KURL linkURL = links[index];
	        // Each link is expected to appear only once.
	        ASSERT(!m_localLinks.contains(linkURL.string()));
	        m_localLinks.set(linkURL.string(), localPaths[index]);
	    }

	    ASSERT(m_dataBuffer.isEmpty());
	}

	void WebPageSerializerImpl::collectTargetFrames()
	{
	ASSERT(!m_framesCollected);
	m_framesCollected = true;

	// First, process main frame.
	m_frames.append(m_specifiedWebLocalFrameImpl);
	// Return now if user only needs to serialize specified frame, not including
	// all sub-frames.
	if (!m_recursiveSerialization)
	return;
	// Collect all frames inside the specified frame.
	for (WebLocalFrameImpl* frame : m_frames) {
	// Get current using document.
	Document* currentDoc = frame->frame()->document();
	// Go through sub-frames.
	RefPtrWillBeRawPtr<HTMLAllCollection> all = currentDoc->all();

	for (unsigned i = 0; Element* element = all->item(i); ++i) {
	if (!element->isHTMLElement())
	continue;
	WebLocalFrameImpl* webFrame =
	WebLocalFrameImpl::fromFrameOwnerElement(element);
	if (webFrame)
	m_frames.append(webFrame);
	}
	}
	}

	// Serializes every collected frame whose document URL is valid and present
	// in m_localLinks, streaming the encoded markup to the client frame by
	// frame. After the last frame the client is notified with
	// AllFramesAreFinished. Returns true when at least one frame was
	// serialized.
	bool WebPageSerializerImpl::serialize()
	{
	    if (!m_framesCollected)
	        collectTargetFrames();

	    bool didSerialization = false;
	    KURL mainURL = m_specifiedWebLocalFrameImpl->frame()->document()->url();

	    for (unsigned i = 0; i < m_frames.size(); ++i) {
	        WebLocalFrameImpl* webFrame = m_frames[i];
	        Document* document = webFrame->frame()->document();
	        const KURL& url = document->url();

	        // Frames without a valid URL or without a local link entry are
	        // not serialized.
	        if (!url.isValid() || !m_localLinks.contains(url.string()))
	            continue;

	        didSerialization = true;

	        // Fall back to UTF-8 when the document has no valid encoding.
	        const WTF::TextEncoding& textEncoding = document->encoding().isValid() ? document->encoding() : UTF8Encoding();
	        // Only the frame whose URL equals the main URL gets the local
	        // directory name; all other frames use an empty one.
	        String directoryName = url == mainURL ? m_localDirectoryName : "";

	        SerializeDomParam param(url, textEncoding, document, directoryName);

	        Element* documentElement = document->documentElement();
	        if (documentElement)
	            buildContentForNode(documentElement, &param);

	        // Push out whatever is still buffered and mark this frame finished.
	        encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsFinished, &param, ForceFlush);
	    }

	    ASSERT(m_dataBuffer.isEmpty());
	    m_client->didSerializeDataForFrame(KURL(), WebCString("", 0), WebPageSerializerClient::AllFramesAreFinished);
	    return didSerialization;
	}

	} // namespace blink