// htmlFilterOutputStream.java
// ---------------------------
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004, 2005
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
/*
 This class implements a character stream writer (it extends java.io.Writer).
 Any data written to it is automatically parsed.
 After writing has finished, the htmlFilter can be read out.
*/
package net.yacy.document.parser.html;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.Enumeration;
import java.util.Locale;
import java.util.Properties;
import java.util.Stack;
import net.yacy.document.parser.html.ContentScraper.TagName;
import net.yacy.kelondro.io.CharBuffer;
public final class TransformerWriter extends Writer {
// Characters with a special meaning in the markup handled by this class.
// lb is the character a token must start with to be treated as a tag.
public static final char lb = '<';
public static final char rb = '>';
// NOTE(review): dash and excl are presumably used to recognise comment
// delimiters ("<!--" / "-->"); their use is outside this part of the file.
public static final char dash = '-';
public static final char excl = '!';
public static final char singlequote = '\'';
public static final char doublequote = '"';
// Target stream handed to the constructor; may be null.
private final OutputStream outStream;
// Character writer wrapping outStream; it is only created when outStream is
// not null and is otherwise left unassigned (null).
private OutputStreamWriter out;
// Working buffer, created in the constructor with ContentScraper.MAX_DOCSIZE
// and the requested initial size.
private CharBuffer buffer;
// Created empty in the constructor.
// NOTE(review): presumably holds the currently open tags - confirm against the code that uses it.
private Stack tagStack;
// Scraper supplied by the caller of the constructor.
private final Scraper scraper;
// Parser state flags; all three start out as false.
private boolean inSingleQuote;
private boolean inDoubleQuote;
private boolean inComment;
// Starts out as true.
// NOTE(review): presumably cleared once the content looks like binary data - confirm.
private boolean binaryUnsuspect;
// Configuration flag taken unchanged from the constructor argument of the same name.
private final boolean passbyIfBinarySuspect;
/**
 * Creates a transformer writer with an initial buffer size of 64.
 * All work is delegated to the constructor that takes an explicit
 * initial buffer size.
 *
 * @param outStream the stream to write to; may be null
 * @param charSet the charset used when writing to outStream; if null the platform default charset is used
 * @param scraper the scraper to use
 * @param passbyIfBinarySuspect stored unchanged as configuration flag
 */
public TransformerWriter(final OutputStream outStream, final Charset charSet, final Scraper scraper, final boolean passbyIfBinarySuspect) {
    this(outStream, charSet, scraper, passbyIfBinarySuspect, 64);
}
public TransformerWriter(
final OutputStream outStream,
final Charset charSet,
final Scraper scraper,
final boolean passbyIfBinarySuspect,
final int initialBufferSize
) {
this.outStream = outStream;
this.scraper = scraper;
this.buffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, initialBufferSize);
this.tagStack = new Stack();
this.inSingleQuote = false;
this.inDoubleQuote = false;
this.inComment = false;
this.binaryUnsuspect = true;
this.passbyIfBinarySuspect = passbyIfBinarySuspect;
if (this.outStream != null) {
this.out = new OutputStreamWriter(this.outStream,(charSet == null)?Charset.defaultCharset():charSet);
}
}
/**
 * Generates the markup of a single opening or closing tag from raw option characters.
 * The result has the form {@code <tagname + tagopts + >} for an opening tag and
 * {@code </tagname + tagopts + >} for a closing tag.
 *
 * @param tagname the name of the tag
 * @param opening true for an opening tag, false for a closing tag (a '/' is inserted after the '<')
 * @param tagopts the raw option characters; must not be null, may be empty
 * @return the characters of the generated tag
 */
public static char[] genTag0raw(final String tagname, final boolean opening, final char[] tagopts) {
    final int capacity = tagname.length() + tagopts.length + 3;
    final CharBuffer tag = new CharBuffer(ContentScraper.MAX_DOCSIZE, capacity);

    tag.append('<');
    if (!opening) {
        tag.append('/');
    }
    tag.append(tagname);
    if (tagopts.length != 0) {
        // the options are copied verbatim; no separating blank is inserted
        // here, so the caller has to provide it as part of tagopts
        tag.append(tagopts);
    }
    tag.append('>');

    final char[] chars = tag.getChars();
    tag.close();
    return chars;
}
/**
 * Generates the markup of a complete element from raw option characters:
 * an opening tag, the given text and the matching closing tag.
 *
 * @param tagname the name of the tag
 * @param tagopts the raw option characters of the opening tag; must not be null, may be empty
 * @param text the characters placed between opening and closing tag
 * @return the characters of the generated element
 */
public static char[] genTag1raw(final String tagname, final char[] tagopts, final char[] text) {
    final int capacity = 2 * tagname.length() + tagopts.length + text.length + 5;
    final CharBuffer element = new CharBuffer(ContentScraper.MAX_DOCSIZE, capacity);

    // opening tag
    element.append('<');
    element.append(tagname);
    if (tagopts.length != 0) {
        // the options are copied verbatim; no separating blank is inserted
        // here, so the caller has to provide it as part of tagopts
        element.append(tagopts);
    }
    element.append('>');

    // content
    element.append(text);

    // closing tag
    element.append('<');
    element.append('/');
    element.append(tagname);
    element.append('>');

    final char[] chars = element.getChars();
    element.close();
    return chars;
}
/**
 * Generates the markup of an opening tag whose options are given as properties.
 * If the properties are not empty they are rendered with {@code genOpts} and
 * separated from the tag name by a blank.
 *
 * @param tagname the name of the tag
 * @param tagopts the tag options; must not be null, may be empty
 * @param quotechar the character put around every option value
 * @return the characters of the generated opening tag
 */
public static char[] genTag0(final String tagname, final Properties tagopts, final char quotechar) {
    // options are only rendered if there are any
    char[] renderedOpts = null;
    if (!tagopts.isEmpty()) {
        renderedOpts = genOpts(tagopts, quotechar);
    }

    final int optsLength = (renderedOpts == null) ? 0 : renderedOpts.length + 1;
    final CharBuffer tag = new CharBuffer(ContentScraper.MAX_DOCSIZE, 2 * tagname.length() + optsLength + 2);

    tag.append('<');
    tag.append(tagname);
    if (renderedOpts != null) {
        tag.appendSpace();
        tag.append(renderedOpts);
    }
    tag.append('>');

    final char[] chars = tag.getChars();
    tag.close();
    return chars;
}
/**
 * Generates the markup of a complete element whose options are given as properties:
 * the opening tag as produced by {@code genTag0}, the given text and the matching closing tag.
 *
 * @param tagname the name of the tag
 * @param tagopts the tag options; must not be null, may be empty
 * @param text the characters placed between opening and closing tag
 * @param quotechar the character put around every option value
 * @return the characters of the generated element
 */
public static char[] genTag1(final String tagname, final Properties tagopts, final char[] text, final char quotechar) {
    final char[] openingTag = genTag0(tagname, tagopts, quotechar);

    // the buffer is created from the opening tag; text and closing tag are appended to it
    final int capacity = openingTag.length + text.length + tagname.length() + 3;
    final CharBuffer element = new CharBuffer(ContentScraper.MAX_DOCSIZE, openingTag, capacity);
    element.append(text);
    element.append('<');
    element.append('/');
    element.append(tagname);
    element.append('>');

    final char[] chars = element.getChars();
    element.close();
    return chars;
}
// a helper method for pretty-printing of properties for html tags
public static char[] genOpts(final Properties prop, final char quotechar) {
final Enumeration> e = prop.propertyNames();
final CharBuffer bb = new CharBuffer(ContentScraper.MAX_DOCSIZE, prop.size() * 40);
String key;
while (e.hasMoreElements()) {
key = (String) e.nextElement();
bb.appendSpace().append(key).append('=').append(quotechar);
bb.append(prop.getProperty(key));
bb.append(quotechar);
}
final char[] result;
if (bb.length() > 0)
result = bb.getChars(1);
else
result = bb.getChars();
bb.close();
return result;
}
/**
* the token processor distinguishes three different types of input: opening tag, closing tag, text content
* @param in - the token to be processed
* @param quotechar
* @return a processed version of the token
*/
private char[] tokenProcessor(final char[] in, final char quotechar) {
if (in.length == 0) return in;
// scan the string and parse structure
if (in.length <= 2 || in[0] != lb) return filterTag(in); // this is a text
// this is a tag
String tag;
int tagend;
if (in[1] == '/') {
// a closing tag
tagend = tagEnd(in, 2);
tag = new String(in, 2, tagend - 2).toLowerCase(Locale.ROOT);
final char[] text = new char[in.length - tagend - 1];
System.arraycopy(in, tagend, text, 0, in.length - tagend - 1);
return filterTag(text, quotechar, tag, false);
}
// don't add text from within