/*
    BFilter - a smart ad-filtering web proxy
    Copyright (C) 2002-2006  Joseph Artsimovich <joseph_a@mail.ru>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#include "pch.h"

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include "HtmlProcessor.h"
#include "HtmlLexerDefinitions.h"
#include "NonCopyable.h"
#include "ServiceContext.h"
#include "HtmlElementNode.h"
#include "HtmlTextNode.h"
#include "JsEnvironment.h"
#include "JsInliner.h"
#include "HtmlEscaper.h"
#include "HttpFetcher.h"
#include "RequestPtr.h"
#include "StrongPtr.h"
#include "BString.h"
#include "BStringPOD.h"
#include "SBOutStream.h"
#include "ErrorDescriptor.h"
#include "ErrorCodes.h"
#include "HeuristicScore.h"
#include "ImageHeuristicScore.h"
#include "IframeHeuristicScore.h"
#include "FlashHeuristicScore.h"
#include "WebbugHeuristicScore.h"
#include "AdSuspect.h"
#include "AdSuspectList.h"
#include "ImageAdSuspect.h"
#include "IframeAdSuspect.h"
#include "FlashAdSuspect.h"
#include "WebbugAdSuspect.h"
#include "Reactor.h"
#include "HttpRequestMetadata.h"
#include "HttpResponseMetadata.h"
#include "HttpRequestLine.h"
#include "HttpStatusLine.h"
#include "HttpHeader.h"
#include "StringUtils.h"
#include "InsensitiveEqual.h"
#include "ArraySize.h"
#include "GlobalState.h"
#include "UrlPatterns.h"
#include "Debug.h"
#include <stddef.h>
#include <limits>
#include <memory>
#include <vector>
#include <algorithm>
#include <sstream>
#include <cstring>
#include <cassert>

using namespace std;

// Pairs a tag name (as a C string) with the TagType it stands for.
struct HtmlProcessor::TagTypePair
{
	char const* tname; // the tag name
	TagType type;      // the tag type associated with tname
	
	// Implicit conversion to the tag name, so that a TagTypePair can be
	// passed wherever a plain C string is accepted.
	operator char const*() const { return tname; }
};


// A [first, second) pair of byte iterators.
// Both members are references: an IPair does not copy the iterators,
// so the iterators passed to the constructor must outlive it.
struct HtmlProcessor::IPair
{
	SplittableBuffer::ByteIterator const& first;
	SplittableBuffer::ByteIterator const& second;
	
	IPair(SplittableBuffer::ByteIterator const& f,
	      SplittableBuffer::ByteIterator const& s)
	: first(f), second(s) {}
};


// Ordering functor between a C string and an iterator range (IPair),
// usable in both argument orders. The comparison itself is delegated
// to StringUtils::ciLess (presumably a case-insensitive "less than").
class HtmlProcessor::IPairIcaseComparator
{
public:
	IPairIcaseComparator() {}
private:	
	// For C strings: yields true while ptr points at a non-NUL character,
	// that is, while the end of the string has NOT been reached.
	struct CStringEndPredicate
	{
		bool operator()(char const* ptr) const {
			return *ptr;
		}
	};
	
	// For iterator ranges: yields true while iter differs from the
	// stored end iterator. The end iterator is held by reference.
	struct IterEndPredicate
	{
		SplittableBuffer::ByteIterator const& end_;
		
		IterEndPredicate(SplittableBuffer::ByteIterator const& end) : end_(end) {}
		
		bool operator()(SplittableBuffer::ByteIterator const& iter) const {
			return iter != end_;
		}
	};
public:
	// C string on the left, iterator range on the right.
	bool operator()(char const* lhs, IPair const& rhs) {
		return StringUtils::ciLess(
			lhs, CStringEndPredicate(),
			rhs.first, IterEndPredicate(rhs.second)
		);
	}
	
	// Iterator range on the left, C string on the right.
	bool operator()(IPair const& lhs, char const* rhs) {
		return StringUtils::ciLess(
			lhs.first, IterEndPredicate(lhs.second),
			rhs, CStringEndPredicate()
		);
	}
};

/*
State that accompanies the processing of a script's output.
It is constructed from a JsEnvironment and privately implements
JsEnvironment::Listener. It keeps track of nesting levels, a set of
flag bits describing what the script did, text / image / link counters,
the collected ad suspects and the buffered subprocessor output.
*/
class HtmlProcessor::ScriptContext : private JsEnvironment::Listener
{
public:
	ScriptContext(JsEnvironment& env);
	
	virtual ~ScriptContext();
	
	// The processor currently handling this script's output (may be null).
	HtmlProcessor* getSubProcessor() { return m_pSubProcessor; }
	
	void setSubProcessor(HtmlProcessor* proc) { m_pSubProcessor = proc; }
	
	int getNestLevel() const { return m_nestLevel; }
	
	void setNestLevel(int level) { m_nestLevel = level; }
	
	int getInnerHtmlLevel() const { return m_innerHtmlLevel; }
	
	void setInnerHtmlLevel(int level) { m_innerHtmlLevel = level; }
	
	int getDescendantScriptFetches() const { return m_descendantScriptFetches; }
	
	// Increments the counter and returns the new value.
	int incDescendantScriptFetches() { return ++m_descendantScriptFetches; }
	
	// Each is...Involved() / set...Involved() pair below reads / raises
	// a single bit of m_flags. Flags are never cleared through this
	// interface.
	bool isExternalScriptInvolved() const { return m_flags & EXTERNAL; }
	
	void setExternalScriptInvolved() { m_flags |= EXTERNAL; }
	
	bool isInnerHtmlInvolved() const { return m_flags & INNER_HTML; }
	
	void setInnerHtmlInvolved() { m_flags |= INNER_HTML; }
	
	bool isRedirectInvolved() const { return m_flags & REDIRECT; }
	
	void setRedirectInvolved() { m_flags |= REDIRECT; }
	
	bool isPersistentCookieInvolved() const { return m_flags & COOKIE; }
	
	void setPersistentCookieInvolved() { m_flags |= COOKIE; }
	
	bool isCssInvolved() const { return m_flags & CSS; }
	
	void setCssInvolved() { m_flags |= CSS; }
	
	bool isOnLoadAssigned() const { return m_flags & ONLOAD_ASSIGNED; }
	
	void setOnLoadAssigned() { m_flags |= ONLOAD_ASSIGNED; }
	
	// True if at least one URL has been recorded in m_unrelatedNestedScripts.
	bool isUnrelatedScriptInvolved() const { return !m_unrelatedNestedScripts.empty(); }
	
	// Accumulates the size of text found inside an anchor.
	void addClickableText(size_t size) { m_clickableTextSize += size; }
	
	size_t getClickableTextSize() const { return m_clickableTextSize; }
	
	// Accumulates the size of text found outside of any anchor.
	void addUnclickableText(size_t size) { m_unclickableTextSize += size; }
	
	size_t getUnclickableTextSize() const { return m_unclickableTextSize; }
	
	void incUnclickableImageCount() { ++m_unclickableImageCount; }
	
	int getUnclickableImageCount() const { return m_unclickableImageCount; }
	
	void incLocalLinkCount() { ++m_localLinkCount; }
	
	int getLocalLinkCount() const { return m_localLinkCount; }
	
	AdSuspectList& adSuspects() { return m_adSuspects; }
	
	AdSuspectList const& adSuspects() const { return m_adSuspects; }
	
	// WEBBUG_TAIL_PROVED is the union of both evidence bits, so the tail
	// counts as proved only when BOTH pieces of evidence are present.
	// A plain "& WEBBUG_TAIL_PROVED" would already be true with one of them.
	bool isWebbugTail() const {
		return (m_webbugTailEvidence & WEBBUG_TAIL_PROVED) == WEBBUG_TAIL_PROVED;
	}
	
	bool isSubprocessorOutputDiscarded() const { return m_flags & DISCARD_SUBPROCESSOR_OUTPUT; }
	
	void discardSubprocessorOutput() { m_flags |= DISCARD_SUBPROCESSOR_OUTPUT; }
	
	SplittableBuffer& subprocessorOutput() { return m_subprocessorOutput; }
	
	SplittableBuffer const& subprocessorOutput() const { return m_subprocessorOutput; }
	
	void finishScriptOutput();
	
	void consumeSubprocessorOutput();
	
	void processNestedScriptUrl(URI const& url, URI const& page_url);
	
	bool isTooManyUnrelatedScripts() const { return m_flags & TOO_MANY_UNRELATED_SCRIPTS; }
private:
	// Bits of m_flags.
	enum {
		EXTERNAL = 1, REDIRECT = 2, COOKIE = 4,
		CSS = 8, ONLOAD_ASSIGNED = 16,
		DISCARD_SUBPROCESSOR_OUTPUT = 32,
		TOO_MANY_UNRELATED_SCRIPTS = 64,
		INNER_HTML = 128
	};
	// Bits of m_webbugTailEvidence; PROVED == EV1 | EV2.
	enum {
		WEBBUG_TAIL_EV1 = 1, WEBBUG_TAIL_EV2 = 2,
		WEBBUG_TAIL_PROVED = 3
	};
	static size_t const SCRIPT_OUTPUT_LIMIT = 2048;
	
	// JsEnvironment::Listener callbacks.
	virtual void processJsOutput(char const* data, bool newline);
	
	virtual void processOnLoadAssignment();
	
	virtual void processInnerHtmlAssignment(char const* data);
	
	int m_nestLevel;
	int m_innerHtmlLevel;
	int m_flags;
	int m_descendantScriptFetches;
	int m_unclickableImageCount;
	int m_localLinkCount;
	size_t m_clickableTextSize;
	size_t m_unclickableTextSize;
	size_t m_scriptOutputSize;
	int m_webbugTailEvidence;
	HtmlProcessor* m_pSubProcessor;
	AdSuspectList m_adSuspects;
	SplittableBuffer m_subprocessorOutput;
	size_t m_subprocessorOutputSize;
	std::vector<URI> m_unrelatedNestedScripts;
	// Urls of scripts from domains unrelated to the originating page.
	// The original (nest level = 0) script is also present here.
};


// An HtmlProcessor created on behalf of a parent processor and bound to
// a ScriptContext. The m_old* members hold a nest level, an innerHTML
// level and a sub-processor pointer; presumably the context's previous
// values, saved by the constructor and restored by the destructor
// (definitions are not in this part of the file -- confirm there).
class HtmlProcessor::SubProcessor : public HtmlProcessor
{
	DECLARE_NON_COPYABLE(SubProcessor)
public:
	// What produced the markup this sub-processor handles.
	enum Type { DOCUMENT_WRITE, INNER_HTML };
	
	SubProcessor(HtmlProcessor& parent, ScriptContext& context, Type type);
	
	~SubProcessor();
private:
	ScriptContext& m_rContext;
	Type m_type;
	int m_oldNestLevel;
	int m_oldInnerHtmlLevel;
	HtmlProcessor* m_pOldSubProcessor;
};


// Interface of an operation on a script node.
// Concrete operations capture their arguments at construction time and
// do the actual work in performOperation().
class HtmlProcessor::AbstractScriptOperation
{
public:
	virtual ~AbstractScriptOperation() {};
	
	virtual void performOperation() = 0;
};


// Script operation that, judging by its name, keeps the script unchanged.
// It stores the processor, the node, a pointer to the script code and
// a comment string; the node and the code are held as raw pointers.
class HtmlProcessor::LeaveAsIsScriptOperation : public AbstractScriptOperation
{
public:
	LeaveAsIsScriptOperation(
		HtmlProcessor& processor, HtmlNode* node,
		SplittableBuffer const* code, BString const& comment);
	
	virtual ~LeaveAsIsScriptOperation();
	
	virtual void performOperation();
private:
	HtmlProcessor& m_rProcessor;
	HtmlNode* m_pNode;
	SplittableBuffer const* m_pCode;
	BString m_comment;
};


// Script operation that, judging by its name, removes the script.
// It stores the processor, the node (raw pointer) and a comment string.
class HtmlProcessor::RemoveScriptOperation : public AbstractScriptOperation
{
public:
	RemoveScriptOperation(
		HtmlProcessor& processor, HtmlNode* node,
		BString const& comment);
	
	virtual ~RemoveScriptOperation();
	
	virtual void performOperation();
private:
	HtmlProcessor& m_rProcessor;
	HtmlNode* m_pNode;
	BString m_comment;
};


// Script operation that, judging by its name, replaces the script body.
// Unlike LeaveAsIsScriptOperation, the new body is stored by value
// (m_newBody is a SplittableBuffer, not a pointer to one).
class HtmlProcessor::ReplaceBodyScriptOperation : public AbstractScriptOperation
{
public:
	ReplaceBodyScriptOperation(
		HtmlProcessor& processor, HtmlNode* node,
		SplittableBuffer const& new_body, BString const& comment);
	
	virtual ~ReplaceBodyScriptOperation();
	
	virtual void performOperation();
private:
	HtmlProcessor& m_rProcessor;
	HtmlNode* m_pNode;
	SplittableBuffer m_newBody;
	BString m_comment;
};


// Scope guard for a temporary SBOutStream: asserts the stream holds no
// data on entry and clears its data when the scope object is destroyed.
class HtmlProcessor::TempStreamScope
{
public:
	TempStreamScope(SBOutStream& strm) : m_rStream(strm) {
		// A non-empty stream here means another scope is still using it.
		assert(m_rStream.data().empty() && "TempStreamScope's can't be nested");
	}
	
	~TempStreamScope() { m_rStream.clearData(); }
private:
	SBOutStream& m_rStream;
};


// Plain result record for a script fetch.
// Default-constructed with zero redirects and both flags false.
struct HtmlProcessor::ScriptFetchResult
{
	ScriptFetchResult()
	: numRedirects(0), persistentCookies(false), dynamicContent(false) {}
	
	IntrusivePtr<ErrorDescriptor> errorDescriptor;
	int numRedirects;
	bool persistentCookies;
	bool dynamicContent;
};


// A JsEnvironment::PageOpenListener that exposes a URL and a target
// through getURL() / getTarget(); presumably these are the values
// received by processPageOpening() (its definition is elsewhere).
class HtmlProcessor::PageOpenListener : public JsEnvironment::PageOpenListener
{
public:
	PageOpenListener(JsEnvironment& js_env);
	
	virtual ~PageOpenListener();
	
	virtual void processPageOpening(char const* url, char const* target);
	
	BString const& getURL() const { return m_url; }
	
	BString const& getTarget() const { return m_target; }
private:
	BString m_url;
	BString m_target;
};


// A shared, always-empty buffer. Its begin()/end() serve as an empty
// range for the closing-tag handlers that are invoked directly for
// explicitly empty opening tags.
SplittableBuffer const HtmlProcessor::m_sEmptyBuffer;

// Appends a string to the processed output.
inline void
HtmlProcessor::outputData(std::string const& data)
{
	m_processedData << data;
}

// Appends the contents of a buffer to the processed output.
inline void
HtmlProcessor::outputData(SplittableBuffer const& data)
{
	m_processedData.data().append(data);
}

// Appends the range [begin, end) to the processed output.
inline void
HtmlProcessor::outputData(Iterator const& begin, Iterator const& end)
{
	m_processedData.data().append(begin, end);
}

// Nest level reported by the script context, or 0 when this processor
// has no script context.
inline int
HtmlProcessor::getScriptNestLevel() const
{
	if (!m_pScriptContext) {
		return 0;
	}
	return m_pScriptContext->getNestLevel();
}


// Constructs a processor without a script context: empty base URL,
// HTML content type, zero page hint, the document fragment itself as
// the containing node, and a temporary stream created with 4000.
HtmlProcessor::HtmlProcessor(ServiceContext& service_context)
:	m_rContext(service_context),
	m_pScriptContext(0),
	m_baseURL(BString()),
	m_contentType(CONTENT_HTML),
	m_pageHint(0),
	m_pContainingNode(&m_docFragment),
	m_openingTagType(TAG_OTHER),
	m_closingTagType(TAG_OTHER),
	m_isInsideNoscript(false),
	m_noscriptFollowsScript(false),
	m_tempStream(4000)
{
}

// Constructs a processor bound to a script context, inheriting the
// page-related state of its parent (request metadata, base URL, JS
// environment, content type, page hint, map bookkeeping and the
// noscript flags). The parent's containing anchor and map nodes are
// cloned rather than shared. The document fragment and the tag types
// start out fresh.
HtmlProcessor::HtmlProcessor(HtmlProcessor const& parent, ScriptContext& script_context)
:	m_rContext(parent.m_rContext),
	m_pScriptContext(&script_context),
	m_ptrRequestMetadata(parent.m_ptrRequestMetadata),
	m_baseURL(parent.m_baseURL),
	m_jsEnv(parent.m_jsEnv),
	m_contentType(parent.m_contentType),
	m_pageHint(parent.m_pageHint),
	m_pContainingNode(&m_docFragment),
	m_ptrContainingAnchor(parent.m_ptrContainingAnchor
		? parent.m_ptrContainingAnchor->clone() : HtmlNodePtr()),
	m_ptrContainingMap(parent.m_ptrContainingMap
		? parent.m_ptrContainingMap->clone() : HtmlNodePtr()),
	m_mapTags(parent.m_mapTags),
	m_imgTagsUsingMap(parent.m_imgTagsUsingMap),
	m_openingTagType(TAG_OTHER),
	m_closingTagType(TAG_OTHER),
	m_isInsideNoscript(parent.m_isInsideNoscript),
	m_noscriptFollowsScript(parent.m_noscriptFollowsScript),
	m_tempStream(1500) // subprocessors usually produce little output
{
}

// Nothing to release explicitly; members clean up after themselves.
HtmlProcessor::~HtmlProcessor()
{
}

// Binds the processor to the request that produced the page:
// remembers the request, takes the request URI as the base URL (also
// handing it to the JS environment), records whether the content is
// XHTML, and computes the page hint from the base URL.
void
HtmlProcessor::setRelatedInfo(ConstRequestPtr const& request, bool xhtml_content)
{
	m_ptrRequestMetadata = request;
	m_baseURL = request->requestLine().getURI();
	m_jsEnv.setPageURL(m_baseURL);
	if (xhtml_content) {
		m_contentType = CONTENT_XHTML;
	} else {
		m_contentType = CONTENT_HTML;
	}
	m_pageHint = HeuristicScore::getHintModifier(m_baseURL);
}

// Feeds data to the lexer. Once the end of input has been reached,
// whatever output is still pending gets flushed.
void
HtmlProcessor::consume(SplittableBuffer& data, bool eof)
{
	HtmlLexer::consume(data, eof);
	if (!eof) {
		return;
	}
	outputPendingData();
}

void
HtmlProcessor::reset()
{
	HtmlLexerCore::reset();
	m_ptrRequestMetadata.reset(0);
	BString empty_str;
	URI(empty_str).swap(m_baseURL);
	m_jsEnv.reset();
	m_contentType = CONTENT_HTML;
	m_ptrCurTag.reset(0);
	m_docFragment.removeAllChildren();
	m_pContainingNode = &m_docFragment;
	m_ptrContainingAnchor.reset(0);
	m_ptrContainingMap.reset(0);
	m_mapTags.clear();
	m_imgTagsUsingMap.clear();
	m_openingTagType = TAG_OTHER;
	m_closingTagType = TAG_OTHER;
	m_isInsideNoscript = false;
	m_noscriptFollowsScript = false;
	m_processedData.clear();
	m_tempStream.clear();
}

// A DOCTYPE declaration is passed through like ordinary text.
void
HtmlProcessor::processDocType(Iterator const& begin, Iterator const& end)
{
	processAsText(begin, end);
}

// Handles a run of text. When processing script output, the length of
// the text with surrounding whitespace trimmed is added to the
// clickable total (inside an anchor) or the unclickable total
// (outside of one). The text itself is always passed on untrimmed.
void
HtmlProcessor::processText(Iterator const& begin, Iterator const& end)
{
	if (m_pScriptContext) {
		Iterator const trimmed_begin(StringUtils::ltrim(begin, end));
		Iterator const trimmed_end(StringUtils::rtrim(trimmed_begin, end));
		size_t const trimmed_len = trimmed_end - trimmed_begin;
		bool const inside_anchor = (m_ptrContainingAnchor.get() != 0);
		if (inside_anchor) {
			m_pScriptContext->addClickableText(trimmed_len);
		} else {
			m_pScriptContext->addUnclickableText(trimmed_len);
		}
	}
	processAsText(begin, end);
}

// Comments are passed through like ordinary text.
void
HtmlProcessor::processComment(Iterator const& begin, Iterator const& end)
{
	processAsText(begin, end);
}

// CDATA content is passed through like ordinary text.
void
HtmlProcessor::processCDATA(Iterator const& begin, Iterator const& end)
{
	processAsText(begin, end);
}

// Passes [begin, end) through unmodified. With an empty document
// fragment the data goes straight to the output; otherwise it is
// appended to the current containing node of the fragment.
void
HtmlProcessor::processAsText(Iterator const& begin, Iterator const& end)
{
	if (!m_docFragment.hasChildren()) {
		outputData(begin, end);
		return;
	}
	tagTextAppend(m_pContainingNode, begin, end);
}

// Called with the name of an opening tag. Records the tag type and,
// for every type other than TAG_OTHER, creates the element node
// (with a lowercased name) that subsequent attributes will go to.
void
HtmlProcessor::processOpeningTagName(Iterator const& begin, Iterator const& end)
{
	// An unclosed <A> is considered finished as soon as a tag that
	// terminates anchors shows up.
	if (m_ptrContainingAnchor && tagTerminatesAnchor(begin, end)) {
		m_ptrContainingAnchor.reset(0);
	}
	
	m_openingTagType = getTagType(begin, end);
	if (m_openingTagType == TAG_OTHER) {
		// not a tag we are interested in
		m_ptrCurTag.reset(0);
		return;
	}
	
	BString raw_name(SplittableBuffer::toBString(begin, end));
	m_ptrCurTag = HtmlElementNode::create(StringUtils::toLowerBString(raw_name), false);
	m_ptrCurTag->setTagType(m_openingTagType);
}

// Called when an opening tag is complete. Tags without an element node
// are passed through as text. All others are dispatched, by tag type,
// to the matching processOpening*() handler; if the handler reports
// that it hasn't taken care of the output, the tag is passed through
// as text as well.
void
HtmlProcessor::processOpeningTag(Iterator const& begin, Iterator const& end, bool explicit_empty)
{
	if (!m_ptrCurTag) {
		processAsText(begin, end);
		return;
	}
	
	typedef bool (HtmlProcessor::*Handler)(HtmlNode*, bool);
	// indexed by TagType
	static Handler const handlers[] = {
		&HtmlProcessor::processOpeningOther,    // TAG_OTHER
		&HtmlProcessor::processOpeningA,        // TAG_A
		&HtmlProcessor::processOpeningArea,     // TAG_AREA
		&HtmlProcessor::processOpeningBase,     // TAG_BASE
		&HtmlProcessor::processOpeningEmbed,    // TAG_EMBED
		&HtmlProcessor::processOpeningIframe,   // TAG_IFRAME
		&HtmlProcessor::processOpeningImg,      // TAG_IMG
		&HtmlProcessor::processOpeningMap,      // TAG_MAP
		&HtmlProcessor::processOpeningNoscript, // TAG_NOSCRIPT
		&HtmlProcessor::processOpeningObject,   // TAG_OBJECT
		&HtmlProcessor::processOpeningParam,    // TAG_PARAM
		&HtmlProcessor::processOpeningScript,   // TAG_SCRIPT
		&HtmlProcessor::processOpeningOther,    // TAG_STYLE
		&HtmlProcessor::processOpeningOther     // TAG_TEXTAREA
	};
	assert(ARRAY_SIZE(handlers) == NUM_TAG_TYPES);
	
	Handler const on_open = handlers[m_ptrCurTag->getTagType()];
	if ((this->*on_open)(m_ptrCurTag.get(), explicit_empty)) {
		return;
	}
	processAsText(begin, end);
}

// Called with the name of a closing tag; remembers its tag type
// for processClosingTag() and isNoscriptToBeExpected().
void
HtmlProcessor::processClosingTagName(Iterator const& begin, Iterator const& end)
{
	m_closingTagType = getTagType(begin, end);
}

// Called when a closing tag is complete. Remembers whether a <noscript>
// follows, then dispatches to the processClosing*() handler selected
// by the type recorded in processClosingTagName().
void
HtmlProcessor::processClosingTag(Iterator const& begin, Iterator const& end, bool noscript_follows)
{
	m_noscriptFollowsScript = noscript_follows;
	
	typedef void (HtmlProcessor::*Handler)(Iterator const&, Iterator const&);
	// indexed by TagType
	static Handler const handlers[] = {
		&HtmlProcessor::processClosingOther,    // TAG_OTHER
		&HtmlProcessor::processClosingA,        // TAG_A
		&HtmlProcessor::processClosingOther,    // TAG_AREA
		&HtmlProcessor::processClosingOther,    // TAG_BASE
		&HtmlProcessor::processClosingOther,    // TAG_EMBED
		&HtmlProcessor::processClosingIframe,   // TAG_IFRAME
		&HtmlProcessor::processClosingOther,    // TAG_IMG
		&HtmlProcessor::processClosingMap,      // TAG_MAP
		&HtmlProcessor::processClosingNoscript, // TAG_NOSCRIPT
		&HtmlProcessor::processClosingObject,   // TAG_OBJECT
		&HtmlProcessor::processClosingOther,    // TAG_PARAM
		&HtmlProcessor::processClosingScript,   // TAG_SCRIPT
		&HtmlProcessor::processClosingOther,    // TAG_STYLE
		&HtmlProcessor::processClosingOther     // TAG_TEXTAREA
	};
	assert(ARRAY_SIZE(handlers) == NUM_TAG_TYPES);
	
	Handler const on_close = handlers[m_closingTagType];
	(this->*on_close)(begin, end);
}

// Called with the name of an attribute. For a tag we keep a node for,
// the lowercased name is remembered as the current attribute name.
// When processing script output, "id", "class" and "style" attributes
// (compared case-insensitively) mark the script as involving CSS.
// Always returns true.
bool
HtmlProcessor::processAttrName(Iterator const& begin, Iterator const& end)
{
	// Nobody needs the name if there is no current tag and either
	// there is no script context or its output is being discarded.
	if (!m_ptrCurTag && (!m_pScriptContext ||
	     m_pScriptContext->isSubprocessorOutputDiscarded())) {
		return true;
	}
	
	BString name(SplittableBuffer::toBString(begin, end));
	if (m_ptrCurTag) {
		m_curAttrName = StringUtils::toLowerBString(name);
	}
	if (m_pScriptContext) {
		InsensitiveEqual same;
		bool const css_related =
			same(name, BString("id"))
			|| same(name, BString("class"))
			|| same(name, BString("style"));
		if (css_related) {
			m_pScriptContext->setCssInvolved();
		}
	}
	return true;
}

// Called with the value of the current attribute. The value is stored
// in the current tag (if any) as a raw attribute, with every double
// quote replaced by &quot; first.
void
HtmlProcessor::processAttrValue(Iterator const& begin, Iterator const& end)
{
	if (!m_ptrCurTag) {
		return;
	}
	
	BString attr_value(SplittableBuffer::toBString(begin, end));
	bool const has_dquote = (
		StringUtils::find(attr_value.begin(), attr_value.end(), '"')
		!= attr_value.end()
	);
	if (has_dquote) {
		attr_value = BString(StringUtils::replace("\"", "&quot;", attr_value.toStdString()));
	}
	m_ptrCurTag->setRawAttribute(m_curAttrName, attr_value);
}

// Called for an attribute that has no value; it is recorded in
// the current tag (if any) as a boolean attribute.
void
HtmlProcessor::processAttrNullValue()
{
	if (!m_ptrCurTag) {
		return;
	}
	m_ptrCurTag->setBooleanAttribute(m_curAttrName);
}

// Tells whether the tag that has just been opened starts a CDATA
// section: true for <script>, <style> and <textarea>.
bool
HtmlProcessor::isCDATAStarting() const
{
	// indexed by TagType
	static bool const table[] = {
		false, // TAG_OTHER
		false, // TAG_A
		false, // TAG_AREA
		false, // TAG_BASE
		false, // TAG_EMBED
		false, // TAG_IFRAME
		false, // TAG_IMG
		false, // TAG_MAP
		false, // TAG_NOSCRIPT
		false, // TAG_OBJECT
		false, // TAG_PARAM
		true,  // TAG_SCRIPT
		true,  // TAG_STYLE
		true   // TAG_TEXTAREA
	};
	// catches a tag type being added without updating the table
	assert(ARRAY_SIZE(table) == NUM_TAG_TYPES);
	return table[m_openingTagType];
}

// A CDATA section ends at a closing tag whose type matches
// the type of the tag that opened it.
bool
HtmlProcessor::isCDATAEnding(Iterator const& begin, Iterator const& end) const
{
	return getTagType(begin, end) == m_openingTagType;
}

bool
HtmlProcessor::isNoscriptToBeExpected() const
{
	/*
	Returning true makes the lexer look at a few more bytes to find
	out whether a <noscript> follows the current tag. We want that
	after a </script>, because an ad discovered in the adjacent
	<noscript> gives away an ad in the script.
	
	We never ask for it in a subprocessor (that is, when processing
	the output of a script), because waiting for a <noscript> there
	could affect the order of code execution.
	Consider the following javascript code:
	//---------------
	document.write('<script>alert(1)<'+'/script>');
	alert(2);
	document.write('<script>alert(3)<'+'/script>');
	//---------------
	In a browser the order of alerts would be 1 then 2 then 3,
	but if we'd wait for <noscript>, we'd get 2 then 1 then 3.
	I've actually seen some code that depends on this behavior.
	*/
	if (m_pScriptContext) {
		return false;
	}
	return m_closingTagType == TAG_SCRIPT;
}

// The lexer is considered idle only in its INITIAL and TEXT start
// conditions; in any other one it is in the middle of something.
bool
HtmlProcessor::isLexerInTheMiddleOfSomething() const
{
	switch (scond()) {
		case HtmlLexerDefinitions::INITIAL:
		case HtmlLexerDefinitions::TEXT:
			return false;
		default:
			return true;
	}
}

// Handler for tags that need no special treatment on opening.
// Returns false, which tells processOpeningTag() to pass the tag
// through as text.
bool
HtmlProcessor::processOpeningOther(HtmlNode* node, bool explicit_empty)
{
	return false;
}

// Handles an opening <a>. Unless explicitly empty, it becomes the
// containing anchor; inside a <map> it is also appended to the map.
// When processing script output, a link whose URL is not unrelated to
// the page and has no "http://" in its decoded path and query is
// counted as a local link. Always returns false, so the tag itself is
// passed through as text.
bool
HtmlProcessor::processOpeningA(HtmlNode* anchor, bool explicit_empty)
{
	if (!explicit_empty) {
		m_ptrContainingAnchor.reset(anchor);
	}
	if (m_ptrContainingMap) {
		m_ptrContainingMap->appendChild(anchor);
	}
	if (!m_pScriptContext) {
		return false;
	}
	
	URI link_url(anchor->getAttribute(BString("href")));
	if (HeuristicScore::getUrlRelationship(link_url, m_baseURL)
	    == HeuristicScore::URLS_UNRELATED) {
		return false;
	}
	
	TempStreamScope tmp_guard(m_tempStream);
	m_tempStream << link_url.getDecodedPath()
		<< '?' << link_url.getDecodedQuery();
	if (m_tempStream.data().find(BString("http://")).isAtRightBorder()) {
		// "http://" was not found
		m_pScriptContext->incLocalLinkCount();
	}
	return false;
}

// Handles an opening <area>: inside a <map> it is appended to the map.
// Returns false, so the tag itself is passed through as text.
bool
HtmlProcessor::processOpeningArea(HtmlNode* area, bool explicit_empty)
{
	if (m_ptrContainingMap) {
		m_ptrContainingMap->appendChild(area);
	}
	return false;
}

// Handles <base>: its href, resolved against the current base URL,
// becomes the new base URL. Returns false, so the tag itself is
// passed through as text.
bool
HtmlProcessor::processOpeningBase(HtmlNode* base, bool explicit_empty)
{
	URI const href(base->getAttribute(BString("href")));
	URI new_base(m_baseURL, href);
	new_base.swap(m_baseURL);
	return false;
}

// Handles an opening <embed>. Only embeds with an empty type or the
// Flash MIME type are examined; those get a FlashHeuristicScore.
// Returns true if the tag was written out by processOpeningTagAsText()
// with its src rewritten, false if the caller is to pass the original
// tag through as text.
bool
HtmlProcessor::processOpeningEmbed(HtmlNode* embed, bool explicit_empty)
{
	BString const flash_mimetype("application/x-shockwave-flash");
	BString const loop_str("loop");
	BString const menu_str("menu");
	BString const true_str("true");
	BString const false_str("false");
	
	InsensitiveEqual ieq;
	
	BString type = embed->getAttribute(BString("type"));
	if (type.empty() || ieq(type, flash_mimetype)) {
		BString src = embed->getAttribute(BString("src"));
		URI url_rel(src);
		URI url(m_baseURL, url_rel);
		BString width_str = embed->getAttribute(BString("width"));
		BString height_str = embed->getAttribute(BString("height"));
		// -1 is the fallback value passed to toNumber()
		int width = StringUtils::toNumber<int>(width_str, -1);
		int height = StringUtils::toNumber<int>(height_str, -1);
		// loop is on only if it's explicitly "true",
		// menu is on unless it's explicitly "false"
		bool const loop = ieq(embed->getAttribute(loop_str), true_str);
		bool const menu = !ieq(embed->getAttribute(menu_str), false_str);
		
		FlashHeuristicScore score;
		score.processPageHint(m_pageHint);
		score.processSize(width, height);
		score.processUrl(url, url_rel, m_baseURL);
		score.processOptions(loop, menu);
		score.processNoscript(m_isInsideNoscript);
		//DEBUGLOG("[score = " << score.getNumericScore() << "] EMBED: " << url);
		if (m_pScriptContext) {
			// script-related evidence, plus registration of the
			// object as an ad suspect of the script
			score.processScriptLevel(
				m_pScriptContext->getNestLevel(),
				m_pScriptContext->isExternalScriptInvolved()
			);
			score.processScriptRedirect(m_pScriptContext->isRedirectInvolved());
			score.processScriptPersistentCookies(
				m_pScriptContext->isPersistentCookieInvolved()
			);
			m_pScriptContext->adSuspects().add(
				IntrusivePtr<AdSuspect>(new FlashAdSuspect(
					width, height, url, url_rel.isAbsolute(), score
				))
			);
		}
		if (score.getStatus() == HeuristicScore::AD || embed->hasAdMark()) {
			// definitely an ad: mark the container and point src
			// to the substitution URL
			m_pContainingNode->setAdMark(true);
			BString new_url = FlashAdSuspect::getSubstitutionURL(url, width, height);
			embed->setAttribute(BString("src"), new_url);
			processOpeningTagAsText(embed, explicit_empty);
			return true;
		} else if (score.getStatus() > HeuristicScore::NOT_AD) {
			// not sure: point src to the analyze URL, but only for
			// http URLs with a src that isn't just whitespace
			if (InsensitiveEqual()(url.getScheme(), BString("http")) &&
			    StringUtils::ltrim(src.begin(), src.end()) != src.end()) {
				BString new_url = AdSuspect::getAnalyzeURL(url, score);
				embed->setAttribute(BString("src"), new_url);
				processOpeningTagAsText(embed, explicit_empty);
				return true;
			}
		}
	}
	return false;
}

// Handles an opening <iframe>: the node becomes a container in the
// document fragment. An explicitly empty iframe is closed right away,
// with an empty range standing for the closing tag. Returns true:
// the tag's output has been taken care of.
bool
HtmlProcessor::processOpeningIframe(HtmlNode* iframe, bool explicit_empty)
{
	docFragmentPutContainer(iframe);
	if (explicit_empty) {
		processClosingIframe(m_sEmptyBuffer.begin(), m_sEmptyBuffer.end());
	}
	return true;
}

// Handles an <img>: an explicitly empty one is marked as closed, then
// the work is delegated to processImg() with the containing anchor
// (possibly null). Returns what processImg() returns.
bool
HtmlProcessor::processOpeningImg(HtmlNode* img, bool explicit_empty)
{
	if (explicit_empty) {
		img->setClosed(true);
	}
	return processImg(img, explicit_empty, m_ptrContainingAnchor.get());
}

// Handles an opening <map>: it becomes the containing map and is
// registered in m_mapTags under its name attribute. An explicitly
// empty map is closed right away. Returns false, so the tag itself
// is passed through as text.
bool
HtmlProcessor::processOpeningMap(HtmlNode* map, bool explicit_empty)
{
	m_ptrContainingMap.reset(map);
	m_mapTags.insert(TagsByName::value_type(map->getAttribute(BString("name")), m_ptrContainingMap));
	if (explicit_empty) {
		processClosingMap(m_sEmptyBuffer.begin(), m_sEmptyBuffer.end());
	}
	return false;
}

// Handles an opening <noscript>. The "inside noscript" flag is always
// raised. Only a <noscript> that follows a script is put into the
// document fragment (returning true); any other is passed through as
// text (returning false).
// NOTE(review): for an explicitly empty <noscript/> that does not follow
// a script, m_isInsideNoscript stays set until some </noscript> is
// seen -- confirm that this is intended.
bool
HtmlProcessor::processOpeningNoscript(HtmlNode* noscript, bool explicit_empty)
{
	m_isInsideNoscript = true;
	if (!m_noscriptFollowsScript) {
		return false;
	}
	docFragmentPutContainer(noscript);
	if (explicit_empty) {
		processClosingNoscript(m_sEmptyBuffer.begin(), m_sEmptyBuffer.end());
	}
	return true;
}

// Handles an opening <object>. A regular one becomes a container in
// the document fragment (returning true); an explicitly empty one is
// left alone and passed through as text (returning false).
bool
HtmlProcessor::processOpeningObject(HtmlNode* object, bool explicit_empty)
{
	if (!explicit_empty) {
		docFragmentPutContainer(object);
		return true;
	}
	return false;
}

// Handles an opening <script>: the node becomes a container in the
// document fragment. An explicitly empty script is closed right away,
// with an empty range standing for the closing tag. Returns true:
// the tag's output has been taken care of.
bool
HtmlProcessor::processOpeningScript(HtmlNode* script, bool explicit_empty)
{
	docFragmentPutContainer(script);
	if (explicit_empty) {
		processClosingScript(m_sEmptyBuffer.begin(), m_sEmptyBuffer.end());
	}
	return true;
}

// Handles a <param>: an explicitly empty one is marked as closed, then
// the node is put into the document fragment as an element.
// Returns true: the tag's output has been taken care of.
bool
HtmlProcessor::processOpeningParam(HtmlNode* param, bool explicit_empty)
{
	if (explicit_empty) {
		param->setClosed(true);
	}
	docFragmentPutElement(param);
	return true;
}

// Handler for closing tags that need no special treatment:
// the tag is passed through as text.
void
HtmlProcessor::processClosingOther(Iterator const& begin, Iterator const& end)
{
	processAsText(begin, end);
}

// Handles </a>: we are no longer inside an anchor.
// The tag itself is passed through as text.
void
HtmlProcessor::processClosingA(Iterator const& begin, Iterator const& end)
{
	m_ptrContainingAnchor.reset(0);
	processAsText(begin, end);
}

// Handles </iframe>. The closest <iframe> among the containing node and
// its ancestors is closed and scored with IframeHeuristicScore.
// Depending on the verdict it is replaced with a substitution,
// gets its src redirected to the analyze URL, or is left as is.
void
HtmlProcessor::processClosingIframe(Iterator const& begin, Iterator const& end)
{
	HtmlNode* iframe = findAncestorOrSelf(m_pContainingNode, BString("iframe"));
	if (!iframe) {
		// a closing tag without the opening one
		processAsText(begin, end);
		return;
	}
	iframe->setClosed(true);
	processUnclosedTags(iframe);
	BString src = iframe->getAttribute(BString("src"));
	URI url_rel(src);
	URI url(m_baseURL, url_rel);
	BString width_str = iframe->getAttribute(BString("width"));
	BString height_str = iframe->getAttribute(BString("height"));
	// -1 is the fallback value passed to toNumber()
	int width = StringUtils::toNumber<int>(width_str, -1);
	int height = StringUtils::toNumber<int>(height_str, -1);
	
	IframeHeuristicScore score;
	score.processPageHint(m_pageHint);
	score.processSize(width, height);
	score.processUrl(url, url_rel, m_baseURL);
	score.processNoscript(m_isInsideNoscript);
	if (m_pScriptContext) {
		// script-related evidence, plus registration of the
		// iframe as an ad suspect of the script
		score.processScriptLevel(
			m_pScriptContext->getNestLevel(),
			m_pScriptContext->isExternalScriptInvolved()
		);
		score.processScriptRedirect(m_pScriptContext->isRedirectInvolved());
		score.processScriptPersistentCookies(
			m_pScriptContext->isPersistentCookieInvolved()
		);
		
		m_pScriptContext->adSuspects().add(
			IntrusivePtr<AdSuspect>(new IframeAdSuspect(
				width, height, url, url_rel.isAbsolute(), score
			))
		);
	}
	if (score.getStatus() == HeuristicScore::AD || iframe->hasAdMark()) {
		// definitely an ad: mark the parent and replace the whole
		// iframe with the substitution html
		iframe->getParent()->setAdMark(true);
		IframeAdSuspect ad(width, height, url, url_rel.isAbsolute(), score);
		processDocFragmentNodeReplacement(iframe, *ad.getHtmlSubstitution());
		return;
	} else if (score.getStatus() > HeuristicScore::NOT_AD) {
		// not sure: point src to the analyze URL, but only for
		// http URLs with a src that isn't just whitespace
		if (InsensitiveEqual()(url.getScheme(), BString("http")) &&
		    StringUtils::ltrim(src.begin(), src.end()) != src.end()) {
			BString new_url = AdSuspect::getAnalyzeURL(url, score);
			iframe->setAttribute(BString("src"), new_url);
		}
	}
	processDocFragmentNode(iframe);
}

// Handles </map>. The images that were waiting for the map being closed
// (those registered in m_imgTagsUsingMap under the map's name) finally
// get processed, with the most evil link of the map acting as their
// anchor. The closing tag itself is passed through as text.
void
HtmlProcessor::processClosingMap(Iterator const& begin, Iterator const& end)
{
	if (!m_ptrContainingMap) {
		// a closing tag without the opening one
		processAsText(begin, end);
		return;
	}
	
	HtmlNode* const worst_link = findMostEvilMapLink(m_ptrContainingMap.get());
	typedef MultiTagsByName::iterator ImgIter;
	std::pair<ImgIter, ImgIter> const waiting = m_imgTagsUsingMap.equal_range(
		m_ptrContainingMap->getAttribute(BString("name"))
	);
	for (ImgIter it = waiting.first; it != waiting.second; ++it) {
		HtmlNode* const img = it->second.get();
		if (!img->getParent()) {
			// this means it has been removed from the tree (together with its parent)
			continue;
		}
		if (!processImg(img, m_contentType==CONTENT_XHTML, worst_link)) {
			processDocFragmentNode(img);
		}
	}
	m_imgTagsUsingMap.erase(waiting.first, waiting.second);
	
	m_ptrContainingMap.reset(0);
	processAsText(begin, end);
}

// Handles </noscript>. For a <noscript> that lives in the document
// fragment, the closest preceding sibling <script> (if any) is dealt
// with together with it: either the script gets processed and may
// take the <noscript> down with it, or an ad mark on the <noscript>
// gets the script removed.
void
HtmlProcessor::processClosingNoscript(Iterator const& begin, Iterator const& end)
{
	m_isInsideNoscript = false;
	HtmlNode* noscript = findAncestorOrSelf(m_pContainingNode, BString("noscript"));
	if (!noscript) {
		// no <noscript> in the document fragment
		processAsText(begin, end);
		return;
	}
	noscript->setClosed(true);
	processUnclosedTags(noscript);
	
	// walk back through the siblings until a <script> is found
	HtmlNode* script = noscript;
	do {
		script = script->getPrevSibling();
	} while (script && script->getTagName() != BString("script"));
	if (script) {
		if (!noscript->hasAdMark()) {
			// processScript() tells us through nuke_noscript
			// whether the <noscript> has to go as well
			bool nuke_noscript = false;
			processScript(script, nuke_noscript);
			if (nuke_noscript) {
				BString const comment(
					"<!--\r\n[BFilter] <noscript> nuked because of the above.\r\n-->"
				);
				processDocFragmentNodeReplacement(noscript, comment);
				return;
			}
		} else {
			// The script is replaced with a comment, and the
			// <noscript> is turned into a <span>.
			BString const comment(
				"<!--\r\n[BFilter] Script removed "
				"[ad or webbug in the adjacent noscript].\r\n-->"
			);
			processDocFragmentNodeReplacement(script, comment);
			noscript->setTagName(BString("span"));
		}
	}
	
	processDocFragmentNode(noscript);
}

// Handles </object>. The closest <object> among the containing node and
// its ancestors is closed. If it's a Flash object (by classid or by
// type), its data attribute and its movie / src <param> are examined
// by processFlashObject(), and the parent gets an ad mark if any of
// them turns out to be an ad.
void
HtmlProcessor::processClosingObject(Iterator const& begin, Iterator const& end)
{
	HtmlNode* object = findAncestorOrSelf(m_pContainingNode, BString("object"));
	if (!object) {
		// no <object> in the document fragment
		processAsText(begin, end);
		return;
	}
	object->setClosed(true);
	processUnclosedTags(object);
	
	BString const flash_classid("clsid:d27cdb6e-ae6d-11cf-96b8-444553540000");
	BString const flash_mimetype("application/x-shockwave-flash");
	BString const src_str("src");
	BString const data_str("data");
	BString const name_str("name");
	BString const value_str("value");
	BString const param_str("param");
	BString const loop_str("loop");
	BString const menu_str("menu");
	BString const movie_str("movie");
	BString const true_str("true");
	BString const false_str("false");
	
	InsensitiveEqual ieq;
	
	if (ieq(object->getAttribute(BString("classid")), flash_classid) ||
	    ieq(object->getAttribute(BString("type")), flash_mimetype)) {
		BString width_str = object->getAttribute(BString("width"));
		BString height_str = object->getAttribute(BString("height"));
		// -1 is the fallback value passed to toNumber()
		int width = StringUtils::toNumber<int>(width_str, -1);
		int height = StringUtils::toNumber<int>(height_str, -1);
		// defaults for when the corresponding <param> is missing
		bool loop = false;
		bool menu = true;
		HtmlNode* movie_node = 0;
		// Collect the options from the direct <param> children.
		// If there are several movie / src params, the last one wins.
		for (HtmlNode* child = object->getFirstChild(); child; child = child->getNextSibling()) {
			if (child->getTagName() != param_str) {
				continue;
			}
			BString name = child->getAttribute(name_str);
			if (ieq(name, movie_str) || ieq(name, src_str)) {
				movie_node = child;
			} else if (ieq(name, menu_str)) {
				menu = !ieq(child->getAttribute(value_str), false_str);
			} else if (ieq(name, loop_str)) {
				loop = ieq(child->getAttribute(value_str), true_str);
			}
		}
		
		bool is_ad = false;
		
		// the movie URL may be in the data attribute of the object ...
		if (object->hasAttribute(data_str)) {
			is_ad |= processFlashObject(
				object, data_str,
				width, height, loop, menu, object->hasAdMark()
			);
		}
		// ... and / or in the value attribute of a <param>
		if (movie_node) {
			is_ad |= processFlashObject(
				movie_node, value_str,
				width, height, loop, menu, object->hasAdMark()
			);
		}
		
		if (is_ad) {
			object->getParent()->setAdMark(true);
		}
	}
	
	processDocFragmentNode(object);
}

// Scores the flash movie referenced by the attr_name attribute of node
// and rewrites that attribute if the movie is (or may be) an ad.
// When a script context is active, the movie is also registered there
// as an ad suspect.
// Returns true if we are sure the object is an ad.
bool
HtmlProcessor::processFlashObject(
	HtmlNode* node, BString const& attr_name,
	int width, int height, bool loop, bool menu, bool ad_mark)
{
	BString src = node->getAttribute(attr_name);
	URI relative_url(src);
	URI absolute_url(m_baseURL, relative_url);
	
	FlashHeuristicScore score;
	score.processPageHint(m_pageHint);
	score.processSize(width, height);
	score.processUrl(absolute_url, relative_url, m_baseURL);
	score.processOptions(loop, menu);
	score.processNoscript(m_isInsideNoscript);
	
	if (ScriptContext* const script_ctx = m_pScriptContext) {
		if (node->hasAdMark()) {
			// TODO: do something with the score as well,
			// because node->getParent() may be a fragment node
			node->getParent()->setAdMark(true);
		}
		score.processScriptLevel(
			script_ctx->getNestLevel(),
			script_ctx->isExternalScriptInvolved()
		);
		score.processScriptRedirect(script_ctx->isRedirectInvolved());
		score.processScriptPersistentCookies(
			script_ctx->isPersistentCookieInvolved()
		);
		
		IntrusivePtr<AdSuspect> suspect(new FlashAdSuspect(
			width, height, absolute_url, relative_url.isAbsolute(), score
		));
		script_ctx->adSuspects().add(suspect);
	}
	
	HeuristicScore::Status const status = score.getStatus();
	
	if (ad_mark || status == HeuristicScore::AD) {
		// a definite ad: point the attribute to a substitution
		node->setAttribute(
			attr_name,
			FlashAdSuspect::getSubstitutionURL(absolute_url, width, height)
		);
		return true;
	}
	
	// Not sure yet: only non-blank http URLs get redirected to the analyzer.
	bool const worth_analyzing =
		status > HeuristicScore::NOT_AD &&
		InsensitiveEqual()(absolute_url.getScheme(), BString("http")) &&
		StringUtils::ltrim(src.begin(), src.end()) != src.end();
	if (worth_analyzing) {
		node->setAttribute(
			attr_name, AdSuspect::getAnalyzeURL(absolute_url, score)
		);
	}
	
	return false;
}

// Handles a </script> tag. If the containing node is not a <script>
// element, the tag is passed through as text.
void
HtmlProcessor::processClosingScript(Iterator const& begin, Iterator const& end)
{
	HtmlNode* const script_node = m_pContainingNode;
	
	bool const inside_script = (script_node->getTagName() == BString("script"));
	if (!inside_script) {
		// Either of these happened:
		// 1. we've got a </script> without the opening part
		// 2. script filtering is disabled, so the script tag
		//    didn't go to m_docFragment
		processAsText(begin, end);
		return;
	}
	
	script_node->setClosed(true);
	
	if (!m_noscriptFollowsScript) {
		bool ignored;
		processScript(script_node, ignored);
		return;
	}
	
	// the script is left in the tree, we just step out of it
	m_pContainingNode = script_node->getParent();
}

// Processes a complete <script> element: JavaScript in non-XHTML documents
// is handed to processJavaScript(), unless the configured maximum script
// nesting level has been reached. Everything else is passed on
// to processDocFragmentNode().
void
HtmlProcessor::processScript(HtmlNode* script, bool& nuke_adjacent_noscript)
{
	size_t const current_level = getScriptNestLevel();
	size_t const level_limit = GlobalState::ReadAccessor()->config().getMaxScriptNestLevel();
	
	if (current_level >= level_limit) {
		DEBUGLOG("MAX_SCRIPT_NEST_LEVEL (" << level_limit
			<< ") reached\n" << "Page URL: " << m_baseURL);
		if (m_pScriptContext) {
			m_pScriptContext->discardSubprocessorOutput();
		}
	} else if (isJavaScript(script) && !isXHTML()) {
		/*
		JavaScript in XHTML is not worth processing, because
		document.write() doesn't work there. Ads could in theory be
		generated through DOM functions, but:
		1. BFilter doesn't support DOM.
		2. I haven't ever seen this method used to generate ads.
		*/
		processJavaScript(script, nuke_adjacent_noscript);
		return;
	}
	
	processDocFragmentNode(script);
}

// Obtains the code of a <script> element (inline content, or the body
// fetched from its "src" URL), executes it and then either processes its
// output (top-level script) or just removes the element (nested script,
// that is when m_pScriptContext is set).
//
// An external script is not fetched, and the element is passed to
// processDocFragmentNode() instead, when:
// - the active script context reports too many unrelated scripts or more
//   than MAX_DESCENDANT_SCRIPT_FETCHES descendant fetches,
// - either of the configured script size limits is zero,
// - the page request carries an Authorization header,
// - fetchScript() returns an error.
//
// nuke_adjacent_noscript is forwarded to processJavaScriptOutput().
void
HtmlProcessor::processJavaScript(HtmlNode* script, bool& nuke_adjacent_noscript)
{	
	SplittableBuffer script_code;
	BString script_src;
	URI script_url(script_src);
	bool const script_external = script->hasAttribute(BString("src"));
	ScriptFetchResult fetch_res;
	if (!script_external) {
		if (script->hasChildren()) {
			// A script node has either zero or one child. Can't be more because
			// script content is CDATA (no child tags allowed) and because of
			// the way tagTextAppend() works.
			script_code = script->getFirstChild()->getContent();
		}
		//DEBUGLOG("INLINE SCRIPT: " << std::endl << script_code.toString());
	} else {
		script_src = script->getAttribute(BString("src"));
		// resolve the (possibly relative) src against the page URL
		script_url = URI(m_baseURL, URI(script_src));
		
		if (m_pScriptContext) {
			// we are a script generated by another script
			m_pScriptContext->processNestedScriptUrl(script_url, m_baseURL);
			m_pScriptContext->incDescendantScriptFetches();
			if (m_pScriptContext->isTooManyUnrelatedScripts() ||
			    m_pScriptContext->getDescendantScriptFetches() >
			    MAX_DESCENDANT_SCRIPT_FETCHES) {
				m_pScriptContext->discardSubprocessorOutput();
				processDocFragmentNode(script);
				return;
			}
		}
		
		//DEBUGLOG("Fetching external script from " << script_src);
		
		size_t max_body_size = 0;
		size_t max_fetch_size = 0;
		{
			// the accessor is scoped so that it's released before fetching
			GlobalState::ReadAccessor state;
			// the configured values are multiplied by 1024
			max_body_size = state->config().getMaxScriptEvalSize() * 1024;
			max_fetch_size = state->config().getMaxScriptFetchSize() * 1024;
		}
		
		if (max_body_size == 0 || max_fetch_size == 0) {
			//DEBUGLOG("Script fetching disabled");
			if (m_pScriptContext) {
				m_pScriptContext->discardSubprocessorOutput();
			}
			processDocFragmentNode(script);
			return;
		}
	
		
		if (m_ptrRequestMetadata->headers().hasHeader(BString("Authorization"))) {
			// Fetching a script would probably fail, so we don't even try.
			if (m_pScriptContext) {
				m_pScriptContext->discardSubprocessorOutput();
			}
			processDocFragmentNode(script);
			return;
		}
		
		fetch_res = fetchScript(
			prepareScriptFetchRequest(script_url), script_code,
			max_body_size, max_fetch_size
		);
		if (fetch_res.errorDescriptor.get()) {
			//DEBUGLOG("Script fetching failed: " << edesc->getErrorMessage());
			if (m_pScriptContext) {
				m_pScriptContext->discardSubprocessorOutput();
			}
			processDocFragmentNode(script);
			return;
		} else {
			//DEBUGLOG("Script fetching complete");
			//DEBUGLOG("EXTERNAL_SCRIPT: " << std::endl << script_code.toString());
		}
	}
	
	// A top-level script gets its own context, owned by real_context.
	// A nested script reuses the context of the enclosing script.
	auto_ptr<ScriptContext> real_context;
	ScriptContext* context = m_pScriptContext;
	if (!context) {
		real_context.reset(new ScriptContext(*m_jsEnv));
		context = real_context.get();
		if (script_external) {
			// for nested scripts this was already done before fetching
			context->processNestedScriptUrl(script_url, m_baseURL);
		}
	}
	if (script_external) {
		// record the facts about the fetch in the context
		context->setExternalScriptInvolved();
		if (fetch_res.numRedirects != 0) {
			context->setRedirectInvolved();
		}
		if (fetch_res.persistentCookies) {
			context->setPersistentCookieInvolved();
		}
	}
	
	string script_src_str = script_src.toStdString();
	if (context->getSubProcessor()) {
		// enforce the correct order of script outputs
		context->consumeSubprocessorOutput(); 
	}
	executeJavaScript(*context, script, script_code, script_src_str);
	
	if (!m_pScriptContext) {
		// top-level script: decide what to do with it based on its output
		auto_ptr<AbstractScriptOperation> operation = processJavaScriptOutput(
			*context, script, script_code, script_src_str,
			fetch_res.dynamicContent, nuke_adjacent_noscript
		);
		operation->performOperation();
	} else {
		// we still have to remove the <script> element from the tree.
		RemoveScriptOperation operation(*this, script, BString());
		operation.performOperation();
	}
}

// Runs script_code in m_jsEnv. If the "language" attribute of the script
// element starts with "javascript" (case insensitively), the rest of its
// value is passed to the JS environment as the version string.
void
HtmlProcessor::executeJavaScript(
	ScriptContext& context, HtmlNode* script,
	SplittableBuffer const& script_code, string const& script_src)
{
	// Constructed before the script runs and destroyed on return;
	// presumably it's what receives document.write() output - TODO confirm.
	SubProcessor subprocessor(*this, context, SubProcessor::DOCUMENT_WRITE);
	
	static char const lang_prefix[] = { 'j','a','v','a','s','c','r','i','p','t' };
	
	string version;
	BString lang_attr = script->getAttribute(BString("language"));
	bool const is_versioned_js = StringUtils::ciStartsWith(
		lang_attr.begin(), lang_attr.end(),
		lang_prefix, lang_prefix + sizeof(lang_prefix)
	);
	if (is_versioned_js) {
		// what follows the "javascript" prefix is the version
		lang_attr.trimFront(sizeof(lang_prefix));
		version = lang_attr.toStdString();
	}
	
	m_jsEnv->executeScript(script_code.toBString(), script_src.c_str(), version.c_str());
	context.finishScriptOutput();
}

// Decides what to do with a top-level script after it has been executed,
// based on the information collected in context.
//
// The checks are done in this order:
// 1. Unconditional removal: an unrelated script assigning an onload
//    handler, too many unrelated scripts, a webbug tail.
// 2. Signs of legitimate content (lots of unclickable text, more than one
//    unclickable image, more than two local links): leave the script alone.
// 3. Otherwise the decision depends on the score of the main ad suspect:
//    the script may be marked for analyzing or substituted.
//
// The returned pointer is never null: if none of the above produced
// an operation, a LeaveAsIsScriptOperation is returned.
// nuke_adjacent_noscript is only forwarded to substituteAdScript().
auto_ptr<HtmlProcessor::AbstractScriptOperation>
HtmlProcessor::processJavaScriptOutput(
	ScriptContext& context, HtmlNode* script,
	SplittableBuffer const& script_code, string const& script_src,
	bool dynamic_content, bool& nuke_adjacent_noscript)
{
	auto_ptr<AbstractScriptOperation> res;
	
	//DEBUGLOG("SCRIPT OUTPUT: " << context.subprocessorOutput().toBString());
	
	do { // just to be able to break from it
		if (context.isUnrelatedScriptInvolved() &&
		    context.isOnLoadAssigned()) {
			BString comment("<!--\r\n[BFilter] Script removed "
				"[unrelated script sets onload handler].\r\n-->");
			res.reset(new RemoveScriptOperation(*this, script, comment));
			break;
		}
		
		if (context.isTooManyUnrelatedScripts()) {
			BString comment("<!--\r\n[BFilter] Script removed "
				"[too many unrelated nested scripts].\r\n-->");
			res.reset(new RemoveScriptOperation(*this, script, comment));
			break;
		}
		
		if (context.isWebbugTail()) {
			BString comment("<!--\r\n[BFilter] Script removed [webbug tail].\r\n-->");
			res.reset(new RemoveScriptOperation(*this, script, comment));
			break;
		}
		
		// The following breaks leave res empty, which results
		// in the script being left as is (see below).
		if (context.getUnclickableTextSize() > 200 && context.getClickableTextSize() < 200) {
			//DEBUGLOG("Script " << script_src << " generates a lot of text");
			break;
		} else if (context.getUnclickableImageCount() > 1) {
			//DEBUGLOG("Script " << script_src << " generates > 1 unclickable images");
			break;
		} else if (context.getLocalLinkCount() > 2) {
			//DEBUGLOG("Script " << script_src << " generates > 2 local links"); 
			break;
		}
		
		AdSuspect const* suspect = context.adSuspects().findMainSuspect();
		if (!suspect) {
			break;
		}
		//DEBUGLOG("Suspect score = " << suspect->getScore().getNumericScore()
		//	<< ", location: " << suspect->getLocation());
		
		HeuristicScore::Status const status = suspect->getScore().getStatus();
		
		// For uncertain cases, marking for analyzing is tried first.
		if (status == HeuristicScore::PROBABLY_AD ||
		    status == HeuristicScore::PROBABLY_NOT_AD) {
			res = tryMarkScriptForAnalyzing(
				context, *suspect, script, script_code, script_src
			);
			if (res.get()) {
				break;
			}
		}
		
		// PROBABLY_AD gets here only if marking for analyzing didn't work.
		if (status == HeuristicScore::AD ||
		    status == HeuristicScore::PROBABLY_AD) {
			if (status == HeuristicScore::AD) {
				script->getParent()->setAdMark(true);
			}
			res = substituteAdScript(
				context, *suspect, script,
				script_code, script_src, nuke_adjacent_noscript
			);
		}
	} while (false);
	
	if (!res.get()) {
		/*
		In case the script is external, we may do one of the following:
		1. Inline the script, since we have already fetched it.
		2. Leave it as is.
		
		We have to consider the following points:
		a) Inlining a script could improve performance, especially if
		   it's not cacheable.
		b) Some scripts look for themselves in the DOM (using the "src" property)
		   to find the relative path to itself and to load other scripts from the
		   same directory. In this case, inlining a script would prevent it from
		   working properly.
		c) An advertising engine may choose not to generate an ad under some conditions.
		   For example, a certain engine produces empty (actually not empty but no-op)
		   scripts from time to time.  Not inlining such a script would probably mean
		   a missed ad.
		
		Having considered the above points, I came up with the following:
		
		if (script is external) {
			if (script is dynamically generated) {
				inline it;
			} else {
				leave it as is;
			}
		}
		
		Currently we determine whether the script is dynamically generated
		by the absence of Last-Modified response header.
		*/
		
		BString comment;
		SplittableBuffer const* inline_code = 0;
		
		// a non-empty script_src means the script is external
		if (!script_src.empty() && dynamic_content) {
			inline_code = &script_code;
			BString const prefix("<!--\r\n[BFilter] The following script has been fetched from \"");
			BString const suffix("\"\r\n-->");
			
			TempStreamScope scope(m_tempStream);
			m_tempStream << prefix << script_src << suffix;
			comment = m_tempStream.data().toBString();
		}
		res.reset(new LeaveAsIsScriptOperation(*this, script, inline_code, comment));
	}
	
	return res;
}

/*
Marking a script for analyzing is a matter of replacing the instances
of the ad host (which we already know) with ad_host/bf-analyze.

Returns a null pointer if the script wasn't marked, that is when:
- innerHTML is involved,
- the ad suspects are not on the same host,
- the location of the suspect is not an absolute http URL,
- nothing was marked in the script code.
*/
auto_ptr<HtmlProcessor::AbstractScriptOperation>
HtmlProcessor::tryMarkScriptForAnalyzing(
	ScriptContext& context, AdSuspect const& suspect, HtmlNode* script,
	SplittableBuffer const& script_code, string const& script_src)
{
	auto_ptr<AbstractScriptOperation> operation;
	
	// Suspects on different hosts might theoretically work,
	// but it's better to avoid such complex cases.
	if (context.isInnerHtmlInvolved() || !context.adSuspects().isSameHost()) {
		return operation;
	}
	
	SplittableBuffer marked_code;
	int marks_made = 0;
	bool const absolute_http_location =
		suspect.isLocationAbsolute() &&
		InsensitiveEqual()(suspect.getLocation().getScheme(), BString("http"));
	if (absolute_http_location) {
		marks_made = context.adSuspects().markForAnalyzing(
			script_code, marked_code, HeuristicScore::PROBABLY_NOT_AD
		);
	}
	if (marks_made == 0) {
		return operation;
	}
	
	BString comment;
	if (!script_src.empty()) {
		// an external script: mention where it came from
		BString const prefix(
			"<!--\r\n[BFilter] The following script has been fetched from \""
		);
		BString const suffix("\" and modified.\r\n-->");
		
		TempStreamScope scope(m_tempStream);
		m_tempStream << prefix << script_src << suffix;
		comment = m_tempStream.data().toBString();
	} else {
		comment = BString(
			"<!--\r\n[BFilter] The following script has been modified.\r\n-->"
		);
	}
	
	operation.reset(new ReplaceBodyScriptOperation(*this, script, marked_code, comment));
	return operation;
}

// Creates an operation that gets rid of an ad-generating script.
// Normally the script is replaced with a substitution created by
// createScriptSubstitution(). If CSS is involved (and innerHTML isn't,
// and the subprocessor output wasn't discarded), the script is replaced
// with its own output instead, and nuke_adjacent_noscript is set to true.
auto_ptr<HtmlProcessor::AbstractScriptOperation>
HtmlProcessor::substituteAdScript(
	ScriptContext& context, AdSuspect const& suspect, HtmlNode* script,
	SplittableBuffer const& script_code, string const& script_src,
	bool& nuke_adjacent_noscript)
{
	bool const keep_output_structure =
		context.isCssInvolved() &&
		!context.isInnerHtmlInvolved() &&
		!context.isSubprocessorOutputDiscarded();
	
	if (!keep_output_structure) {
		return createScriptSubstitution(script, suspect);
	}
	
	/*
	Imagine a script that outputs this:
	<div id="x"><a ...><img ...></a></div>
	and a css rule that makes element x absolute positioned.
	If we don't keep the output structure, our ad replacement
	will appear in a wrong place, possibly covering other content.
	*/
	BString const header(
		"<!--\r\n[BFilter] Script has been "
		"replaced with its (modified) output: \r\n-->"
	);
	BString const footer(
		"<!--\r\n[BFilter] Script output end.\r\n-->"
	);
	
	SplittableBuffer replacement;
	replacement.appendDestructive(context.subprocessorOutput());
	replacement.prepend(header);
	replacement.append(footer);
	
	auto_ptr<AbstractScriptOperation> operation(
		new RemoveScriptOperation(*this, script, replacement.toBString())
	);
	nuke_adjacent_noscript = true;
	return operation;
}

// Analyzes an <img> element and rewrites its "src" attribute if the image
// is considered an ad, a webbug, or worth further analyzing.
//
// img            - the image element; may or may not be attached to a parent.
// explicit_empty - passed to processOpeningTagAsText() when the tag is
//                  output as text.
// anchor         - the link the image belongs to, or null. If it's null and
//                  the image is unattached and has a "usemap" attribute,
//                  a link from the corresponding map is used instead.
//
// Returns true if the tag was taken care of here (it was either output or
// stored in the document fragment), false otherwise.
bool
HtmlProcessor::processImg(HtmlNode* img, bool explicit_empty, HtmlNode const* anchor)
{
	BString src = img->getAttribute(BString("src"));
	URI imgurl_rel(src);
	URI imgurl(m_baseURL, imgurl_rel);
	BString width_str = img->getAttribute(BString("width"));
	BString height_str = img->getAttribute(BString("height"));
	// -1 is the fallback value given to toNumber()
	int width = StringUtils::toNumber<int>(width_str, -1);
	int height = StringUtils::toNumber<int>(height_str, -1);
	// both dimensions are specified and don't exceed 2
	bool webbug_size = (width >= 0 && width <= 2 && height >= 0 && height <= 2);
	if (webbug_size) {
		WebbugHeuristicScore score;
		score.processPageHint(m_pageHint);
		score.processImageUrl(imgurl, imgurl_rel, m_baseURL);
		score.processNoscript(m_isInsideNoscript);
		if (m_pScriptContext) {
			score.processScriptLevel(
				m_pScriptContext->getNestLevel(),
				m_pScriptContext->isExternalScriptInvolved()
			);
			score.processScriptRedirect(m_pScriptContext->isRedirectInvolved());
			score.processScriptPersistentCookies(
				m_pScriptContext->isPersistentCookieInvolved()
			);
			
			m_pScriptContext->adSuspects().add(
				IntrusivePtr<AdSuspect>(new WebbugAdSuspect(
					imgurl, imgurl_rel.isAbsolute(), score
				))
			);
		}
		if (score.getStatus() == HeuristicScore::AD) {
			BString subst_url = WebbugAdSuspect::getSubstitutionURL(imgurl);
			img->setAttribute(BString("src"), subst_url);
			if (img->getParent()) {
				img->getParent()->setAdMark(true);
				processDocFragmentNode(img);
			} else {
				m_pContainingNode->setAdMark(true);
				processOpeningTagAsText(img, explicit_empty);
			}
			return true;
		}
	}
	if (!anchor && !img->getParent()) {
		BString const usemap_str("usemap");
		if (img->hasAttribute(usemap_str)) {
			BString mapname = img->getAttribute(usemap_str);
			if (!mapname.empty() && mapname[0] == '#') {
				mapname.trimFront(1);
			}
			TagsByName::iterator it = m_mapTags.find(mapname);
			if (it != m_mapTags.end()) {
				anchor = findMostEvilMapLink(it->second.get());
			} else {
				// No such map in m_mapTags, so the image is stored in
				// the document fragment and remembered by its map name.
				docFragmentPutElement(img);
				m_imgTagsUsingMap.insert(MultiTagsByName::value_type(mapname, HtmlNodePtr(img)));
				return true;
			}
		}
	}
	if (!anchor) {
		if (m_pScriptContext && !webbug_size) {
			m_pScriptContext->incUnclickableImageCount();
		}
	} else {
		BString anchor_link;
		BString anchor_target;
		
		// The onclick handler is executed with a PageOpenListener installed,
		// and the URL and target reported by the listener (if any) take
		// precedence over the href and target attributes.
		BString onclick = anchor->getAttribute(BString("onclick"));
		if (!onclick.empty()) {
			PageOpenListener listener(*m_jsEnv);
			m_jsEnv->executeScriptAsFunction(onclick, "<onclick>", 1);
			anchor_link = listener.getURL();
			anchor_target = listener.getTarget();
		}
		
		if (anchor_link.empty()) {
			anchor_link = anchor->getAttribute(BString("href"));
			anchor_target = anchor->getAttribute(BString("target"));
		}
		
		URI aurl_rel(anchor_link);
		URI aurl(m_baseURL, aurl_rel);
		
		ImageHeuristicScore score;
		score.processPageHint(m_pageHint);
		score.processSize(width, height);
		score.processUrls(m_baseURL, imgurl, imgurl_rel, aurl, aurl_rel);
		score.processLinkTarget(anchor_target);
		score.processAssociatedMap(img->hasAttribute(BString("usemap")));
		score.processNoscript(m_isInsideNoscript);
		//DEBUGLOG("imgurl = " << imgurl);
		//DEBUGLOG("score = " << score.getNumericScore());
		if (m_pScriptContext) {
			score.processScriptLevel(
				m_pScriptContext->getNestLevel(),
				m_pScriptContext->isExternalScriptInvolved()
			);
			score.processScriptRedirect(m_pScriptContext->isRedirectInvolved());
			score.processScriptPersistentCookies(
				m_pScriptContext->isPersistentCookieInvolved()
			);
			
			m_pScriptContext->adSuspects().add(
				IntrusivePtr<AdSuspect>(new ImageAdSuspect(
					width, height, imgurl, imgurl_rel.isAbsolute(),
					aurl.toBString(), score
				))
			);
		}
		if (score.getStatus() == HeuristicScore::AD) {
			// a definite ad: substitute the image
			BString new_url = ImageAdSuspect::getSubstitutionURL(imgurl, width, height);
			img->setAttribute(BString("src"), new_url);
			if (img->getParent()) {
				img->getParent()->setAdMark(true);
				processDocFragmentNode(img);
			} else {
				m_pContainingNode->setAdMark(true);
				processOpeningTagAsText(img, explicit_empty);
			}
			return true;
		} else if (score.getStatus() > HeuristicScore::NOT_AD) {
			// Not sure: only non-blank http URLs are replaced with an analyze URL.
			if (InsensitiveEqual()(imgurl.getScheme(), BString("http")) &&
			    StringUtils::ltrim(src.begin(), src.end()) != src.end()) {
				BString new_url = AdSuspect::getAnalyzeURL(imgurl, score);
				img->setAttribute(BString("src"), new_url);
				if (img->getParent()) {
					processDocFragmentNode(img);
				} else {
					processOpeningTagAsText(img, explicit_empty);
				}
				return true;
			}
		}
	} // if (anchor)
	
	return false;
}

// Closes the containing nodes one by one (as if their closing tags
// were encountered) until the given node becomes the containing node.
void
HtmlProcessor::processUnclosedTags(HtmlNode* to)
{
	for (HtmlNode* unclosed = m_pContainingNode; unclosed != to; unclosed = m_pContainingNode) {
		// simulate an empty closing tag of the corresponding type
		m_closingTagType = unclosed->getTagType();
		processClosingTag(m_sEmptyBuffer.begin(), m_sEmptyBuffer.end());
		assert(m_pContainingNode != unclosed && "Element wan't removed from the tree");
	}
}

// Serializes the opening tag of an unattached node. The text goes directly
// to m_processedData if the document fragment is empty, otherwise it's
// appended as text to the containing node.
void
HtmlProcessor::processOpeningTagAsText(HtmlNode* node, bool explicit_empty)
{
	assert(!node->getParent());
	
	if (!m_docFragment.hasChildren()) {
		// nothing is pending, so we can output right away
		openingTagToStream(m_processedData, node, m_contentType, explicit_empty);
		return;
	}
	
	TempStreamScope scope(m_tempStream);
	openingTagToStream(m_tempStream, node, m_contentType, explicit_empty);
	SplittableBuffer const& tag_text = m_tempStream.data();
	tagTextAppend(m_pContainingNode, tag_text.begin(), tag_text.end());
}

// Processes an unattached node. processNode() requires a parent, so the
// node is first put either into the document fragment (if it has pending
// children) or under a temporary fragment node.
void
HtmlProcessor::processIndependentNode(HtmlNode* node)
{
	assert(!node->getParent());
	
	if (!m_docFragment.hasChildren()) {
		// nothing is pending: use a temporary parent and output directly
		HtmlFragmentNode temp_parent;
		temp_parent.appendChild(node);
		processNode(node, false, true);
		return;
	}
	
	docFragmentPutElement(node);
	processNode(node, false, false);
}

// Processes a node attached to the document fragment tree, then
// outputs the text nodes at the beginning of the document fragment.
void
HtmlProcessor::processDocFragmentNode(HtmlNode* node)
{
	assert(node->getParent());
	
	bool const containing_node_affected = isAncestorOrSelf(node, m_pContainingNode);
	if (containing_node_affected) {
		// the containing node is going away together with this node
		m_pContainingNode = node->getParent();
	}
	
	// direct output is possible only for the very first pending node
	bool const is_leading = (m_docFragment.getFirstChild() == node);
	processNode(node, true, is_leading);
	outputDocFragmentLeadingText();
}

// Puts the replacement node in place of the original one (which is
// detached from its parent) and processes the replacement the same
// way processDocFragmentNode() processes a node.
void
HtmlProcessor::processDocFragmentNodeReplacement(HtmlNode* original, HtmlNode* replacement)
{
	assert(original->getParent());
	
	bool const containing_node_affected = isAncestorOrSelf(original, m_pContainingNode);
	if (containing_node_affected) {
		m_pContainingNode = original->getParent();
	}
	
	// swap the nodes
	original->insertPrevSibling(replacement);
	original->detachFromParent();
	
	bool const is_leading = (m_docFragment.getFirstChild() == replacement);
	processNode(replacement, true, is_leading);
	outputDocFragmentLeadingText();
}

// Same as the overload taking a replacement node, except the replacement
// is a new text node with the given string as its content.
void
HtmlProcessor::processDocFragmentNodeReplacement(HtmlNode* node, BString const& replacement)
{
	HtmlNodePtr replacement_text(HtmlTextNode::create());
	replacement_text->appendContent(replacement);
	
	processDocFragmentNodeReplacement(node, replacement_text.get());
}

// Same as the overload taking a replacement node, except the replacement
// is a new text node with the contents of the given buffer.
void
HtmlProcessor::processDocFragmentNodeReplacement(HtmlNode* node, SplittableBuffer const& replacement)
{
	HtmlNodePtr replacement_text(HtmlTextNode::create());
	replacement_text->appendContent(replacement.begin(), replacement.end());
	
	processDocFragmentNodeReplacement(node, replacement_text.get());
}

// Flattens a node: its opening tag, its children (recursively) and its
// closing tag are either output directly or converted to text nodes that
// are inserted before the node. Children that can't be processed (see
// below) are moved in front of the node as well.
//
// this method destroys the node (removes its children and detaches the node from its parent)
// The exceptions are a text node when direct output is not possible,
// and a stopper tag being skipped: those are left untouched.
//
// skip_stoppers       - if true, descendants (but not the top-level node)
//                       for which isStopperTag() is true are not processed.
// can_output_directly - whether the data may go directly to the output.
// toplevel            - false for recursive invocations.
//
// Returns whether direct output is still possible after this node.
bool
HtmlProcessor::processNode(HtmlNode* node, bool skip_stoppers, bool can_output_directly, bool toplevel)
{
	assert(node->getParent());
	
	if (node->getNodeType() == HtmlNode::TEXT_NODE) {
		if (can_output_directly) {
			outputData(node->getContent());
			node->detachFromParent();
		}
		return can_output_directly;
	}
	if (!toplevel && skip_stoppers && isStopperTag(node->getTagType())) {
		// everything after a skipped stopper has to stay in the tree
		return false;
	}
	
	bool explicit_empty = true; // true will prevent outputing a closing tag for a fragment node
	if (node->getNodeType() == HtmlNode::ELEMENT_NODE) {
		explicit_empty = isXHTML() && !node->hasChildren() && isClosingTagForbidden(node->getTagName());
		if (can_output_directly) {
			openingTagToStream(m_processedData, node, m_contentType, explicit_empty);
		} else {
			// the opening tag becomes a text node preceding the node
			TempStreamScope scope(m_tempStream);
			openingTagToStream(m_tempStream, node, m_contentType, explicit_empty);
			HtmlNodePtr text_node(HtmlTextNode::create());
			SplittableBuffer const& data = m_tempStream.data();
			text_node->appendContent(data.begin(), data.end());
			node->insertPrevSibling(text_node);
		}
	}
	
	// Each child is moved in front of the node before being processed.
	// The next sibling has to be fetched before the child is detached.
	for (HtmlNodePtr child(node->getFirstChild()); child; ) {
		HtmlNode* next_child = child->getNextSibling();
		child->detachFromParent();
		node->insertPrevSibling(child);
		can_output_directly = processNode(child.get(), skip_stoppers, can_output_directly, false);
		child.reset(next_child);
	}
	
	if (!explicit_empty && node->isClosed()) {
		// build "</tagname>"
		BString const& tag_name = node->getTagName();
		std::string ctag;
		ctag.reserve(tag_name.size()+3);
		ctag += "</";
		ctag.append(tag_name.begin(), tag_name.end());
		ctag += '>';
		if (can_output_directly) {
			outputData(ctag);
		} else {
			HtmlNodePtr text_node(HtmlTextNode::create());
			text_node->appendContent(BString(ctag));
			node->insertPrevSibling(text_node);
		}
	}
	
	node->detachFromParent();
	return can_output_directly;
}

// Appends the node to the current containing node
// and makes it the new containing node.
void
HtmlProcessor::docFragmentPutContainer(HtmlNode* node)
{
	m_pContainingNode->appendChild(node);
	m_pContainingNode = node;
}

// Appends the node to the current containing node.
// Unlike docFragmentPutContainer(), the containing node stays the same.
void
HtmlProcessor::docFragmentPutElement(HtmlNode* node)
{
	m_pContainingNode->appendChild(node);
}

// Turns an external script element into an inline one: the "src"
// attribute and the existing children are removed, and a single text
// node with the code prepared by JsInliner becomes the only child.
//
// node - the script element to modify.
// code - the script code to be inlined.
void
HtmlProcessor::inlineExternalScript(HtmlNode* node, SplittableBuffer const& code)
{
	HtmlNodePtr text_node(HtmlTextNode::create());
	
	node->removeAttribute(BString("src"));
	node->removeAllChildren();
	node->appendChild(text_node.get());
	
	// the inliner is told the document type, HTML or XHTML
	SplittableBuffer inlined;
	JsInliner::inlineJavaScript(
		code, inlined,
		isXHTML() ? JsInliner::XHTML : JsInliner::HTML
	);
	text_node->setContent(inlined);
}

// Tells if a script element contains JavaScript.
// If the "type" attribute is present, it has to be "text/javascript".
// Otherwise, if the "language" attribute is present, it has to start
// with "javascript". Both comparisons are case insensitive.
// Without either attribute, the script is considered to be JavaScript.
bool
HtmlProcessor::isJavaScript(HtmlNode const* script)
{
	static char const lang_prefix[] = { 'j','a','v','a','s','c','r','i','p','t' };
	static char const mime_type[] = { 't','e','x','t','/','j','a','v','a','s','c','r','i','p','t' };
	
	BString const type_attr("type");
	BString const language_attr("language");
	
	if (script->hasAttribute(type_attr)) {
		// the type attribute takes precedence over the language attribute
		BString const& type = script->getAttribute(type_attr);
		return StringUtils::ciEqual(
			type.begin(), type.end(),
			mime_type, mime_type + sizeof(mime_type)
		);
	}
	
	if (!script->hasAttribute(language_attr)) {
		return true;
	}
	
	BString const& language = script->getAttribute(language_attr);
	return StringUtils::ciStartsWith(
		language.begin(), language.end(),
		lang_prefix, lang_prefix + sizeof(lang_prefix)
	);
}

// Returns true if <ancestor> is the same node as <descendant>
// or can be reached from it by following the parent links.
bool
HtmlProcessor::isAncestorOrSelf(HtmlNode const* ancestor, HtmlNode const* descendant)
{
	HtmlNode const* current = descendant;
	while (current && current != ancestor) {
		current = current->getParent();
	}
	// a null pointer here means we've passed the root without a match
	return current != 0;
}

// Starting with the node itself and following the parent links, finds
// the first node with the given tag name. Returns null if there isn't one.
HtmlNode*
HtmlProcessor::findAncestorOrSelf(HtmlNode* node, BString const& name)
{
	HtmlNode* current = node;
	while (current && !(current->getTagName() == name)) {
		current = current->getParent();
	}
	return current;
}

// Scores the "href" and "target" attributes of every child of the map
// and returns the child with the highest score. In case of a tie the
// later child wins. Returns null if the map has no children.
HtmlNode*
HtmlProcessor::findMostEvilMapLink(HtmlNode* map)
{
	HtmlNode* worst_link = 0;
	int worst_score = numeric_limits<int>::min();
	
	HtmlNode* candidate = map->getFirstChild();
	while (candidate) {
		ImageHeuristicScore score;
		URI href_rel(BString(candidate->getAttribute(BString("href"))));
		URI href(m_baseURL, href_rel);
		// there is no image here, so the link is used in place of its url
		score.processUrls(m_baseURL, href, href_rel, href, href_rel);
		score.processLinkTarget(candidate->getAttribute(BString("target")));
		
		if (score.getNumericScore() >= worst_score) {
			worst_score = score.getNumericScore();
			worst_link = candidate;
		}
		
		candidate = candidate->getNextSibling();
	}
	
	return worst_link;
}

void
HtmlProcessor::outputDocFragmentLeadingText()
{
	HtmlNode* node;
	while ((node = m_docFragment.getFirstChild()) && node->getNodeType() == HtmlNode::TEXT_NODE) {
		outputData(node->getContent());
		node->detachFromParent();
	}
}

void
HtmlProcessor::outputPendingData()
{
	for (HtmlNode* child; (child = m_docFragment.getFirstChild()); ) {
		processNode(child, false, true);
	}
	m_pContainingNode = &m_docFragment;
}

// Appends text to an element. The text is added to the last child
// of the element if that child is a text node, otherwise a new text
// node is created for it.
void
HtmlProcessor::tagTextAppend(HtmlNode* el, Iterator const& begin, Iterator const& end)
{
	HtmlNode* const tail = el->getLastChild();
	bool const tail_is_text = tail && tail->getNodeType() == HtmlNode::TEXT_NODE;
	
	if (!tail_is_text) {
		HtmlNodePtr fresh_text(HtmlTextNode::create());
		fresh_text->appendContent(begin, end);
		el->appendChild(fresh_text.get());
		return;
	}
	
	tail->appendContent(begin, end);
}

void
HtmlProcessor::openingTagToStream(
	std::ostream& strm, HtmlNode const* node,
	ContentType ctype, bool explicit_empty)
{
	strm << '<' << node->getTagName();
	HtmlNode::AttributeMap::const_iterator it = node->getRawAttributes().begin();
	HtmlNode::AttributeMap::const_iterator const end = node->getRawAttributes().end();
	for (; it != end; ++it) {
		strm << ' ' << it->first;
		if (!it->second.null()) {
			strm << '=' << '"' << it->second << '"';
		} else {
			// boolean attribute
			if (ctype == CONTENT_XHTML) {
				strm << '=' << '"' << it->first << '"';
			}
		}
	}
	if (explicit_empty) {
		strm << " />";
	} else {
		strm << '>';
	}
}

// Maps a tag name given as [begin, end) to its tag type, using a
// binary search with a case insensitive comparator.
// Returns TAG_OTHER for the names that are not in the table.
HtmlProcessor::TagType
HtmlProcessor::getTagType(Iterator const& begin, Iterator const& end)
{
	// the binary search below requires the entries to be sorted by tag name
	static TagTypePair const type_map[] = {
		{ "a",        TAG_A },
		{ "area",     TAG_AREA },
		{ "base",     TAG_BASE },
		{ "embed",    TAG_EMBED },
		{ "iframe",   TAG_IFRAME },
		{ "img",      TAG_IMG },
		{ "map",      TAG_MAP },
		{ "noscript", TAG_NOSCRIPT },
		{ "object",   TAG_OBJECT },
		{ "param",    TAG_PARAM },
		{ "script",   TAG_SCRIPT },
		{ "style",    TAG_STYLE },
		{ "textarea", TAG_TEXTAREA }
	};
	assert(ARRAY_SIZE(type_map) == NUM_TAG_TYPES - 1); // -1 is for TAG_OTHER
	
	TagTypePair const* const map_begin = &type_map[0];
	TagTypePair const* const map_end = &type_map[ARRAY_SIZE(type_map)];
	
	IPair key(begin, end);
	IPairIcaseComparator less;
	TagTypePair const* const pos = std::lower_bound(map_begin, map_end, key, less);
	
	// pos is the first entry that is not less than the key,
	// so it's a match only if the key is not less than it either
	if (pos == map_end || less(key, *pos)) {
		return TAG_OTHER;
	}
	return pos->type;
}

// Tells if the given tag type is a "stopper". processNode() leaves
// such elements untouched when invoked with skip_stoppers == true.
//
// The table is indexed by the tag type, so the entries must follow
// the order of the tag type constants (presumably the order of their
// declaration - verify against the enum when adding a new tag type).
// No range check is done on <type>.
bool
HtmlProcessor::isStopperTag(int type)
{
	static bool const table[] = {
		false, // TAG_OTHER
		false, // TAG_A
		false, // TAG_AREA
		false, // TAG_BASE
		false, // TAG_EMBED
		true,  // TAG_IFRAME
		true,  // TAG_IMG
		false, // TAG_MAP
		true,  // TAG_NOSCRIPT
		true,  // TAG_OBJECT
		false, // TAG_PARAM
		true,  // TAG_SCRIPT
		false, // TAG_STYLE
		false  // TAG_TEXTAREA
	};
	assert(ARRAY_SIZE(table) == NUM_TAG_TYPES);
	return table[type];
}

// Tells if the given tag name is one of "area", "img" or "param".
// processNode() uses this to decide whether a childless element
// in an XHTML document is written as an explicitly empty tag.
//
// tname - the tag name to look up.
bool
HtmlProcessor::isClosingTagForbidden(BString const& tname)
{
	// std::binary_search() requires a sorted range, so the entries must
	// be kept in ascending order. With the previous order (img, param,
	// area) the lookup of "area" could never succeed.
	static BStringPOD const tags[] = { // sorted
		{"area"}, {"img"}, {"param"}
	};
	return std::binary_search(&tags[0], &tags[ARRAY_SIZE(tags)], tname);
}

// Tells if the tag name given as [begin, end) is one of
// "a", "div", "table", "td" or "tr" (compared case insensitively).
bool
HtmlProcessor::tagTerminatesAnchor(Iterator const& begin, Iterator const& end)
{
	// has to stay sorted, as it's being searched with a binary search
	static char const* const tags[] = {
		"a", "div", "table", "td", "tr"
	};
	char const* const* const tags_begin = &tags[0];
	char const* const* const tags_end = &tags[ARRAY_SIZE(tags)];
	
	return std::binary_search(
		tags_begin, tags_end,
		IPair(begin, end), IPairIcaseComparator()
	);
}

// Fetches a script, following redirects, by running the reactor's
// event loop until the fetch is finished.
//
// request        - the request to send. Its URI is updated on redirects.
// output         - receives the response body. It's cleared before
//                  every fetch attempt.
// max_body_size,
// max_fetch_size - size limits passed to HttpFetcher.
//
// The result has errorDescriptor set on failure, which includes:
// filtering being disabled for the URL, more than
// MAX_SCRIPT_FETCH_REDIRECTS redirects, a response status outside of
// [200, 400), a fetcher failure, and the event loop not succeeding.
// If there is a substitution for the URL among the url patterns,
// nothing is fetched and an empty successful result is returned.
HtmlProcessor::ScriptFetchResult
HtmlProcessor::fetchScript(
	RequestPtr const& request, SplittableBuffer& output,
	size_t max_body_size, size_t max_fetch_size)
{
	ScriptFetchResult res;
	
	{
		URI const& uri = request->requestLine().getURI();
		GlobalState::ReadAccessor global_state;
		if (global_state->urlPatterns().getSubstitutionFor(uri)) {
			return res; // as if we fetched it but it was empty
		}
		if (global_state->urlPatterns().isFilteringDisabled(uri)) {
			res.errorDescriptor.reset(new ErrorDescriptor(int(ErrorCodes::SCRIPT_FETCH_NOFILTER)));
			return res;
		}
	}
	
	// every iteration of this loop is a separate fetch attempt;
	// a new iteration is started only to follow a redirect
	while (true) {
		output.clear();
		HttpFetcher fetcher(
			m_rContext.scriptServer(), request,
			max_body_size, max_fetch_size
		);
		bool follow_redirect = false;
		while (fetcher.getStatus() == HttpFetcher::IN_PROGRESS) {
			if (m_rContext.reactor().handleEvents() != Reactor::SUCCESS) {
				break;
			}
			
			// a redirect is detected as soon as the response metadata is available
			if (HttpResponseMetadata const* metadata = fetcher.responseMetadata().get()) {
				if (metadata->statusLine().isRedirect()) {
					res.dynamicContent = true;
					if (++res.numRedirects > MAX_SCRIPT_FETCH_REDIRECTS) {
						res.errorDescriptor.reset(new ErrorDescriptor(
							ErrorCodes::TOO_MANY_REDIRECTS,
							"too many redirects"
						));
						break;
					}
					// the Location header is resolved against the current request URI
					URI redirect(
						request->requestLine().getURI(),
						URI(metadata->headers().getHeader(BString("Location")).getValue())
					);
					request->requestLine().setURI(redirect);
					follow_redirect = true;
					break;
				}
			}
		}
		
		// these flags accumulate over all of the responses, including redirects
		if (HttpResponseMetadata const* metadata = fetcher.responseMetadata().get()) {
			res.persistentCookies = (res.persistentCookies || hasPersistentCookies(*metadata));
			res.dynamicContent = (res.dynamicContent || isDynamicResource(*metadata));
		}
		
		if (follow_redirect) {
			continue;
		}
		
		if (!res.errorDescriptor.get()) {
			if (fetcher.getStatus() == HttpFetcher::COMPLETE) {
				int const status = fetcher.responseMetadata()->statusLine().getCode();
				if (status < 200 || status >= 400) {
					// the status line becomes the error message
					ostringstream strm;
					fetcher.responseMetadata()->statusLine().toStream(strm);
					string status(strm.str());
					status.resize(status.size() - 2); // remove the trailing \r\n
					res.errorDescriptor.reset(new ErrorDescriptor(
						ErrorCodes::WRONG_RESPONSE_STATUS, status
					));
				} else {
					output.appendDestructive(fetcher.responseBody());
				}
			} else if (fetcher.getStatus() == HttpFetcher::FAILED) {
				res.errorDescriptor = fetcher.errorDescriptor();
			} else {
				// Reactor::handle_events() returned -1?
				res.errorDescriptor.reset(new ErrorDescriptor(
					ErrorCodes::UNKNOWN_ERROR,
					"unknown error"
				));
			}
		}
		break;
	}
	
	return res;
}

// Builds a GET request for fetching a script from the given URL.
// If the script is on the same host as the page (hosts are compared
// case insensitively), the request is derived from the page request.
// Otherwise a new request is created, with just a few headers taken
// from the page request. In both cases the Accept, Accept-Language
// and Referer headers are set, and the request body is forbidden.
RequestPtr
HtmlProcessor::prepareScriptFetchRequest(URI const& url)
{
	BString const& script_host = url.getHost();
	BString const& page_host = m_baseURL.getHost();
	bool const same_host = StringUtils::ciEqual(
		script_host.begin(), script_host.end(),
		page_host.begin(), page_host.end()
	);
	
	RequestPtr request;
	if (!same_host) {
		request.reset(new HttpRequestMetadata(
			HttpRequestLine(BString("GET"), url, HttpVersion::HTTP_1_1)
		));
		request->setClientRequestId(m_ptrRequestMetadata->getClientRequestId());
		
		// the only headers a foreign host gets from the page request
		if (HttpHeader const* user_agent = m_ptrRequestMetadata->headers().
		    getHeaderPtr(BString("User-Agent"))) {
			request->headers().setHeader(*user_agent);
		}
		if (HttpHeader const* pragma = m_ptrRequestMetadata->headers().
		    getHeaderPtr(BString("Pragma"))) {
			request->headers().setHeader(*pragma);
		}
		if (HttpHeader const* cache_control = m_ptrRequestMetadata->headers().
		    getHeaderPtr(BString("Cache-Control"))) {
			request->headers().setHeader(*cache_control);
		}
	} else {
		// copying the page request gives us all of its headers,
		// including cookies and authentication
		request.reset(new HttpRequestMetadata(*m_ptrRequestMetadata));
		request->requestLine().setMethod(BString("GET"));
		request->requestLine().setURI(url);
		
		// these headers are removed from the copy
		request->headers().removeHeader(BString("Accept-Encoding"));
		request->headers().removeHeader(BString("Expect"));
		request->headers().removeHeader(BString("Content-Length"));
		request->headers().removeHeader(BString("TE"));
		request->headers().removeHeader(BString("Range"));
		request->headers().removeHeader(BString("If-Modified-Since"));
		request->headers().removeHeader(BString("If-None-Match"));
	}
	
	request->headers().setHeader(BString("Accept"), BString("*" "/" "*"));
	request->headers().setHeader(BString("Accept-Language"), BString("en-us,en;q=0.5"));
	request->headers().setHeader(BString("Referer"), BString(m_baseURL.toString()));
	request->setBodyStatus(HttpRequestMetadata::BODY_FORBIDDEN);
	return request;
}

// Tells if any of the values of the Set-Cookie response header
// is recognized by isPersistentCookie().
bool
HtmlProcessor::hasPersistentCookies(HttpResponseMetadata const& metadata)
{
	// there is also a Set-Cookie2 header, but nobody uses it
	HttpHeader const* const set_cookie = metadata.headers().getHeaderPtr(BString("Set-Cookie"));
	if (!set_cookie) {
		return false;
	}
	
	list<BString> const& cookies = set_cookie->getValues();
	list<BString>::const_iterator const cookies_end = cookies.end();
	for (list<BString>::const_iterator cookie = cookies.begin(); cookie != cookies_end; ++cookie) {
		if (isPersistentCookie(cookie->toStdString())) {
			return true;
		}
	}
	
	return false;
}

// TODO: change the arg type to BString
//
// Tells whether a single Set-Cookie header value describes a persistent
// cookie, that is one carrying an "expires" attribute whose date has
// a year greater than 2000.
//
// The attribute name is matched case-insensitively, because servers send
// both "expires=" and "Expires=".  Only dates with dash-separated fields
// (Wdy, DD-Mon-YYYY HH:MM:SS GMT) are understood: the year is taken to be
// the number that follows the second '-' of the attribute value.
//
// NOTE(review): dates written with spaces (DD Mon YYYY) and the Max-Age
// attribute are not recognized here, so such cookies are reported as
// non-persistent -- confirm whether that matters to the callers.
bool
HtmlProcessor::isPersistentCookie(std::string const& str)
{
	// Cookie attribute names are case-insensitive, so the search is done
	// on an ASCII-lowercased copy.  Lowercasing touches letters only; the
	// separators and digits examined below stay exactly as they were.
	std::string lowered(str);
	for (std::string::size_type i = 0; i < lowered.size(); ++i) {
		char const ch = lowered[i];
		if (ch >= 'A' && ch <= 'Z') {
			lowered[i] = static_cast<char>(ch - 'A' + 'a');
		}
	}
	
	static char const expires[] = "expires=";
	char const* const str_end = lowered.c_str() + lowered.size();
	char const* begin = lowered.c_str();
	while ((begin = strstr(begin, expires))) {
		begin += sizeof(expires) - 1;
		
		// The attribute value runs up to the next ';' or to the end
		// of the string, whichever comes first.
		char const* end = strchr(begin, ';');
		if (!end) {
			end = str_end;
		}
		
		// now we have this kind of a string between <begin> and <end>:
		// wdy, dd-mon-yyyy hh::mm::ss gmt
		begin = StringUtils::find(begin, end, '-');
		if (begin == end) {
			// no day/month separator: look for another "expires="
			continue;
		}
		++begin;
		begin = StringUtils::find(begin, end, '-');
		if (begin == end) {
			// no month/year separator: look for another "expires="
			continue;
		}
		++begin;
		
		// <begin> now points just past the second '-', where the year is
		// expected.
		char const* parse_end = end;
		int year = StringUtils::parseUnsigned<int>(begin, parse_end);
		begin = end; // resume the search after this attribute
		if (year > 2000) {
			// I am not in the mood to parse the whole date
			return true;
		}
	}
	return false;
}

// Heuristic: a response is treated as a dynamic resource when it carries
// no "Last-Modified" header.
bool
HtmlProcessor::isDynamicResource(HttpResponseMetadata const& metadata)
{
	bool const has_last_modified =
		metadata.headers().hasHeader(BString("Last-Modified"));
	return !has_last_modified;
}

// Builds a ReplaceBodyScriptOperation for <node>: the script body comes
// from ad.getJsSubstitution() and a fixed "[BFilter] Script removed"
// comment is passed along with it.
// The result of ad.getJsSubstitution() is dereferenced without a check.
// NOTE(review): presumably callers only get here when a substitution
// exists -- confirm.
auto_ptr<HtmlProcessor::AbstractScriptOperation>
HtmlProcessor::createScriptSubstitution(HtmlNode* node, AdSuspect const& ad)
{
	BString const notice(
		"<!--\r\n[BFilter] Script removed [replacement follows].\r\n-->");
	auto_ptr<AbstractScriptOperation> operation(
		new ReplaceBodyScriptOperation(
			*this, node, *ad.getJsSubstitution(), notice
		)
	);
	return operation;
}


/*=================== HtmlProcessor::ScriptContext ========================*/

// Creates a context with every level, flag, counter and size set to zero
// and with no sub-processor attached, then registers this context as the
// listener of <env>.
HtmlProcessor::ScriptContext::ScriptContext(JsEnvironment& env)
:	m_nestLevel(0),
	m_innerHtmlLevel(0),
	m_flags(0),
	m_descendantScriptFetches(0),
	m_unclickableImageCount(0),
	m_localLinkCount(0),
	m_clickableTextSize(0),
	m_unclickableTextSize(0),
	m_scriptOutputSize(0),
	m_webbugTailEvidence(0),
	m_pSubProcessor(0),
	m_subprocessorOutputSize(0)
{
	// From now on <env> reports to this context.
	env.setListener(*this);
}

// The body is empty: no explicit cleanup is performed here.
HtmlProcessor::ScriptContext::~ScriptContext()
{
}

void
HtmlProcessor::ScriptContext::processJsOutput(char const* data, bool newline)
{
	if (m_innerHtmlLevel != 0) {
		// Scripts inserted using innerHTML can't use document.write().
		return;
	}
	
	if (m_scriptOutputSize == 0 && strncmp(data, "-->", 3) == 0) {
		m_webbugTailEvidence |= WEBBUG_TAIL_EV1;
	}
	
	size_t len = strlen(data);
	m_scriptOutputSize += len;
	if (m_pSubProcessor) {
		SplittableBuffer buf;
		{
			SBOutStream& temp_stream = m_pSubProcessor->tempStream();
			TempStreamScope scope(temp_stream);
			temp_stream.write(data, len);
			if (newline) {
				temp_stream << "\r\n";
			}
			temp_stream.swapData(buf);
		}
		m_pSubProcessor->consume(buf, false);
		consumeSubprocessorOutput();
	}
}

// Records the event by calling setOnLoadAssigned().
// NOTE(review): presumably invoked through the listener registered in the
// constructor when a script assigns an onload handler -- confirm against
// JsEnvironment.
void
HtmlProcessor::ScriptContext::processOnLoadAssignment()
{
	setOnLoadAssigned();
}

void
HtmlProcessor::ScriptContext::processInnerHtmlAssignment(char const* data)
{
	discardSubprocessorOutput();
	
	if (!m_pSubProcessor) {
		// should not happen
		return;
	}
	
	SplittableBuffer buf;
	{
		SBOutStream& temp_stream = m_pSubProcessor->tempStream();
		TempStreamScope scope(temp_stream);
		temp_stream << data;
		temp_stream.swapData(buf);
	}
	
	SubProcessor subprocessor(*m_pSubProcessor, *this, SubProcessor::INNER_HTML);
	subprocessor.consume(buf, true);
}

void
HtmlProcessor::ScriptContext::finishScriptOutput()
{
	if (m_scriptOutputSize == 3) {
		m_webbugTailEvidence |= WEBBUG_TAIL_EV2;
	}
	if (m_pSubProcessor) {
		SplittableBuffer data;
		m_pSubProcessor->consume(data, true);
		consumeSubprocessorOutput();
		if (m_pSubProcessor->isLexerInTheMiddleOfSomething()) {
			// there probably was a script error
			discardSubprocessorOutput(); 
		}
	}
}

void
HtmlProcessor::ScriptContext::consumeSubprocessorOutput()
{
	assert(m_pSubProcessor);
	SplittableBuffer data;
	m_pSubProcessor->processedData().swapData(data);
	if (isSubprocessorOutputDiscarded()) {
		return;
	}
	if (data.empty()) {
		return;
	}
	size_t const data_size = data.size();
	if (m_subprocessorOutputSize + data_size > SCRIPT_OUTPUT_LIMIT) {
		discardSubprocessorOutput();
		return;
	}
	m_subprocessorOutput.appendDestructive(data);
	m_subprocessorOutputSize += data_size;
}

// Takes note of a nested script URL.
//
// URLs that HeuristicScore does not report as unrelated to <page_url> are
// ignored.  An unrelated URL is remembered.  TOO_MANY_UNRELATED_SCRIPTS
// is raised if the URL is also unrelated to at least one of the URLs
// remembered before.
void
HtmlProcessor::ScriptContext::processNestedScriptUrl(URI const& url, URI const& page_url)
{
	typedef HeuristicScore HS;
	
	bool const unrelated_to_page =
		(HS::getUrlRelationship(url, page_url) == HS::URLS_UNRELATED);
	if (!unrelated_to_page) {
		return;
	}
	
	size_t const num_known = m_unrelatedNestedScripts.size();
	for (size_t i = 0; i < num_known; ++i) {
		if (HS::getUrlRelationship(url, m_unrelatedNestedScripts[i]) == HS::URLS_UNRELATED) {
			m_flags |= TOO_MANY_UNRELATED_SCRIPTS;
			break;
		}
	}
	
	m_unrelatedNestedScripts.reserve(5);
	m_unrelatedNestedScripts.push_back(url);
}


/*====================== HtmlProcessor::SubProcessor ======================*/

// Creates a processor nested into <parent> and makes it the current
// sub-processor of <context>.
//
// The context's nest level, innerHTML level and current sub-processor are
// saved so that the destructor can put them back.  The nest level is
// always raised by one.  For the INNER_HTML type the innerHTML level is
// raised as well and the context is told that innerHTML is involved.
HtmlProcessor::SubProcessor::SubProcessor(
	HtmlProcessor& parent, ScriptContext& context, Type type)
:	HtmlProcessor(parent, context),
	m_rContext(context),
	m_oldNestLevel(context.getNestLevel()),
	m_oldInnerHtmlLevel(context.getInnerHtmlLevel()),
	m_pOldSubProcessor(context.getSubProcessor())
{
	m_rContext.setSubProcessor(this);
	m_rContext.setNestLevel(m_oldNestLevel + 1);
	
	bool const via_inner_html = (type == INNER_HTML);
	if (via_inner_html) {
		m_rContext.setInnerHtmlInvolved();
		m_rContext.setInnerHtmlLevel(m_oldInnerHtmlLevel + 1);
	}
}

// Puts back the context state saved by the constructor: the innerHTML
// level, the nest level and the previously current sub-processor.
HtmlProcessor::SubProcessor::~SubProcessor()
{
	m_rContext.setInnerHtmlLevel(m_oldInnerHtmlLevel);
	m_rContext.setNestLevel(m_oldNestLevel);
	m_rContext.setSubProcessor(m_pOldSubProcessor);
}


/*=============== HtmlProcessor::LeaveAsIsScriptOperation =================*/

// Stores the processor, the script node, the script code pointer and the
// comment for later use by performOperation().
// <code> may be null: performOperation() checks it before dereferencing.
HtmlProcessor::LeaveAsIsScriptOperation::LeaveAsIsScriptOperation(
	HtmlProcessor& processor, HtmlNode* node,
	SplittableBuffer const* code, BString const& comment)
:	m_rProcessor(processor),
	m_pNode(node),
	m_pCode(code),
	m_comment(comment)
{
}

// The body is empty: no explicit cleanup is performed here.
HtmlProcessor::LeaveAsIsScriptOperation::~LeaveAsIsScriptOperation()
{
}

void
HtmlProcessor::LeaveAsIsScriptOperation::performOperation()
{
	if (m_pCode && m_pNode->hasAttribute(BString("src"))) {
		m_rProcessor.inlineExternalScript(m_pNode, *m_pCode);
	}
	HtmlNodePtr comment_node(HtmlTextNode::create());
	comment_node->appendContent(m_comment);
	m_pNode->insertPrevSibling(comment_node);
	m_rProcessor.processDocFragmentNode(m_pNode);
}


/*================ HtmlProcessor::RemoveScriptOperation ==================*/

// Stores the processor, the script node and the comment for later use by
// performOperation().
HtmlProcessor::RemoveScriptOperation::RemoveScriptOperation(
	HtmlProcessor& processor, HtmlNode* node, BString const& comment)
:	m_rProcessor(processor),
	m_pNode(node),
	m_comment(comment)
{
}

// The body is empty: no explicit cleanup is performed here.
HtmlProcessor::RemoveScriptOperation::~RemoveScriptOperation()
{
}

// Delegates to HtmlProcessor::processDocFragmentNodeReplacement(), passing
// the stored script node and comment.
// NOTE(review): presumably that call outputs the comment in place of the
// node -- confirm against its definition.
void
HtmlProcessor::RemoveScriptOperation::performOperation()
{
	m_rProcessor.processDocFragmentNodeReplacement(m_pNode, m_comment);
}


/*============== HtmlProcessor::ReplaceBodyScriptOperation ===============*/

// Stores the processor, the script node, the replacement body and the
// comment for later use by performOperation().
HtmlProcessor::ReplaceBodyScriptOperation::ReplaceBodyScriptOperation(
	HtmlProcessor& processor, HtmlNode* node,
	SplittableBuffer const& new_body, BString const& comment)
:	m_rProcessor(processor),
	m_pNode(node),
	m_newBody(new_body),
	m_comment(comment)
{
}

// The body is empty: no explicit cleanup is performed here.
HtmlProcessor::ReplaceBodyScriptOperation::~ReplaceBodyScriptOperation()
{
}

void
HtmlProcessor::ReplaceBodyScriptOperation::performOperation()
{
	m_pNode->removeAttribute(BString("src"));
	m_pNode->removeAllChildren();
	HtmlNodePtr content_node(HtmlTextNode::create());
	content_node->appendContent(m_newBody.begin(), m_newBody.end());
	m_pNode->appendChild(content_node.get());
	HtmlNodePtr comment_node(HtmlTextNode::create());
	comment_node->appendContent(m_comment);
	m_pNode->insertPrevSibling(comment_node);
	m_rProcessor.processDocFragmentNode(m_pNode);
}


/*================== HtmlProcessor::PageOpenListener ===================*/

// Registers this object as the page-open listener of <js_env>.
HtmlProcessor::PageOpenListener::PageOpenListener(JsEnvironment& js_env)
{
	js_env.setPageOpenListener(*this);
}

// The body is empty: no explicit cleanup is performed here.
HtmlProcessor::PageOpenListener::~PageOpenListener()
{
}

// Remembers the URL and the target of a page being opened.
//
// Both arguments must be non-null NUL-terminated strings.  The pair is
// stored only while the stored URL is still empty; later calls leave
// an already stored URL and target untouched.
void
HtmlProcessor::PageOpenListener::processPageOpening(
	char const* url, char const* target)
{
	assert(url);
	assert(target);
	
	if (!m_url.empty()) {
		// a page opening has been recorded already
		return;
	}
	
	m_url = BString(url, strlen(url));
	m_target = BString(target, strlen(target));
}
