// htmlFilterOutputStream.java // --------------------------- // (C) by Michael Peter Christen; mc@yacy.net // first published on http://www.anomic.de // Frankfurt, Germany, 2004, 2005 // // $LastChangedDate$ // $LastChangedRevision$ // $LastChangedBy$ // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA /* This class implements an output stream. Any data written to that output is automatically parsed. After finishing with writing, the htmlFilter can be read out. */ package net.yacy.document.parser.html; import java.io.IOException; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.Writer; import java.nio.charset.Charset; import java.util.Enumeration; import java.util.Locale; import java.util.Properties; import java.util.Stack; import net.yacy.document.parser.html.ContentScraper.TagName; import net.yacy.kelondro.io.CharBuffer; public final class TransformerWriter extends Writer { public static final char lb = '<'; public static final char rb = '>'; public static final char dash = '-'; public static final char excl = '!'; public static final char singlequote = '\''; public static final char doublequote = '"'; private final OutputStream outStream; private OutputStreamWriter out; private CharBuffer buffer; private Stack tagStack; private final Scraper scraper; private boolean inSingleQuote; private boolean inDoubleQuote; private boolean inComment; private boolean binaryUnsuspect; private final boolean passbyIfBinarySuspect; public TransformerWriter( final OutputStream outStream, final Charset charSet, final Scraper scraper, final boolean passbyIfBinarySuspect ) { this(outStream, charSet, scraper, passbyIfBinarySuspect, 64); } public TransformerWriter( final OutputStream outStream, final Charset charSet, final Scraper scraper, final boolean passbyIfBinarySuspect, final int initialBufferSize ) { this.outStream = outStream; this.scraper = scraper; this.buffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, initialBufferSize); this.tagStack = new Stack(); this.inSingleQuote = false; this.inDoubleQuote = false; this.inComment = false; this.binaryUnsuspect = true; this.passbyIfBinarySuspect = passbyIfBinarySuspect; if (this.outStream != null) { this.out = new OutputStreamWriter(this.outStream,(charSet == null)?Charset.defaultCharset():charSet); } } public static char[] genTag0raw(final String tagname, final boolean opening, final char[] tagopts) { final CharBuffer bb = new CharBuffer(ContentScraper.MAX_DOCSIZE, tagname.length() + tagopts.length + 3); bb.append('<'); if (!opening) { bb.append('/'); } bb.append(tagname); if (tagopts.length > 0) { // if (tagopts[0] == (byte) 32) bb.append(tagopts); // else bb.append((byte) 32).append(tagopts); } bb.append('>'); final char[] result = bb.getChars(); bb.close(); return result; } public static char[] genTag1raw(final String tagname, final char[] tagopts, final char[] text) { final CharBuffer bb = new CharBuffer(ContentScraper.MAX_DOCSIZE, 2 * tagname.length() + tagopts.length + text.length + 5); bb.append('<').append(tagname); if (tagopts.length > 0) { // if (tagopts[0] == (byte) 32) bb.append(tagopts); // else bb.append((byte) 32).append(tagopts); } bb.append('>'); bb.append(text); bb.append('<').append('/').append(tagname).append('>'); final char[] result = bb.getChars(); bb.close(); return result; } public static char[] genTag0(final String tagname, final Properties tagopts, final char quotechar) { final char[] tagoptsx = (tagopts.isEmpty()) ? null : genOpts(tagopts, quotechar); final CharBuffer bb = new CharBuffer(ContentScraper.MAX_DOCSIZE, tagname.length() + ((tagoptsx == null) ? 0 : (tagoptsx.length + 1)) + tagname.length() + 2); bb.append('<').append(tagname); if (tagoptsx != null) { bb.appendSpace(); bb.append(tagoptsx); } bb.append('>'); final char[] result = bb.getChars(); bb.close(); return result; } public static char[] genTag1(final String tagname, final Properties tagopts, final char[] text, final char quotechar) { final char[] gt0 = genTag0(tagname, tagopts, quotechar); final CharBuffer cb = new CharBuffer(ContentScraper.MAX_DOCSIZE, gt0, gt0.length + text.length + tagname.length() + 3); cb.append(text).append('<').append('/').append(tagname).append('>'); final char[] result = cb.getChars(); cb.close(); return result; } // a helper method for pretty-printing of properties for html tags public static char[] genOpts(final Properties prop, final char quotechar) { final Enumeration e = prop.propertyNames(); final CharBuffer bb = new CharBuffer(ContentScraper.MAX_DOCSIZE, prop.size() * 40); String key; while (e.hasMoreElements()) { key = (String) e.nextElement(); bb.appendSpace().append(key).append('=').append(quotechar); bb.append(prop.getProperty(key)); bb.append(quotechar); } final char[] result; if (bb.length() > 0) result = bb.getChars(1); else result = bb.getChars(); bb.close(); return result; } /** * the token processor distinguishes three different types of input: opening tag, closing tag, text content * @param in - the token to be processed * @param quotechar * @return a processed version of the token */ private char[] tokenProcessor(final char[] in, final char quotechar) { if (in.length == 0) return in; // scan the string and parse structure if (in.length <= 2 || in[0] != lb) return filterTag(in); // this is a text // this is a tag String tag; int tagend; if (in[1] == '/') { // a closing tag tagend = tagEnd(in, 2); tag = new String(in, 2, tagend - 2).toLowerCase(Locale.ROOT); final char[] text = new char[in.length - tagend - 1]; System.arraycopy(in, tagend, text, 0, in.length - tagend - 1); return filterTag(text, quotechar, tag, false); } // don't add text from within