// Source Code of org.apache.wicket.markup.parser.XmlPullParser

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.wicket.markup.parser;


import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.text.ParseException;
import java.util.Locale;


import org.apache.wicket.markup.parser.XmlTag.TagType;
import org.apache.wicket.markup.parser.XmlTag.TextSegment;
import org.apache.wicket.util.io.FullyBufferedReader;
import org.apache.wicket.util.io.IOUtils;
import org.apache.wicket.util.io.XmlReader;
import org.apache.wicket.util.lang.Args;
import org.apache.wicket.util.parse.metapattern.parsers.TagNameParser;
import org.apache.wicket.util.parse.metapattern.parsers.VariableAssignmentParser;
import org.apache.wicket.util.string.Strings;


/**
 * A fairly shallow markup pull parser which parses a markup string of a given type of markup (for
 * example, html, xml, vxml or wml) into ComponentTag and RawMarkup tokens.
 * 
 * @author Jonathan Locke
 * @author Juergen Donnerstag
 */
public final class XmlPullParser implements IXmlPullParser
{
  /**
   * Tag name {@code style}. {@link #next()} compares the lower-cased text of an open tag against
   * it to decide whether the tag body has to be skipped as raw text.
   */
  public static final String STYLE = "style";


  /**
   * Tag name {@code script}. {@link #next()} compares the lower-cased text of an open tag against
   * it to decide whether the tag body has to be skipped as raw text.
   */
  public static final String SCRIPT = "script";


  /**
   * The encoding of the XML. Assigned by the {@code parse} methods: {@code null} when parsing a
   * CharSequence, otherwise whatever the XmlReader reports.
   */
  private String encoding;


  /**
   * A XML independent reader which loads the whole source data into memory and which provides
   * convenience methods to access the data. It stays {@code null} until one of the
   * {@code parse} methods has been called.
   */
  private FullyBufferedReader input;


  /**
   * temporary variable which will hold the name of the closing tag. While it is non-null, the
   * next call to {@link #next()} delegates to {@code skipUntil()}, which resets it to
   * {@code null} once the matching close tag has been located.
   */
  private String skipUntilText;


  /** The last substring selected from the input */
  private CharSequence lastText;


  /** Everything in between &lt;!DOCTYPE ... &gt; */
  private CharSequence doctype;


  /** The type of what is in lastText; starts out as NOT_INITIALIZED */
  private HttpTagType lastType = HttpTagType.NOT_INITIALIZED;


  /** The last tag found */
  private XmlTag lastTag;


  /**
   * Construct. No input is attached here; that happens in the {@code parse} methods.
   */
  public XmlPullParser()
  {
  }


  @Override
  public final String getEncoding()
  {
    return encoding;
  }


  @Override
  public final CharSequence getDoctype()
  {
    return doctype;
  }


  @Override
  public final CharSequence getInputFromPositionMarker(final int toPos)
  {
    return input.getSubstring(toPos);
  }


  @Override
  public final CharSequence getInput(final int fromPos, final int toPos)
  {
    return input.getSubstring(fromPos, toPos);
  }


  /**
   * Whatever will be in between the current index and the closing tag, will be ignored (and thus
   * treated as raw markup (text). This is useful for tags like 'script'.
   * 
   * @throws ParseException
   */
  private void skipUntil() throws ParseException
  {
    // this is a tag with non-XHTML text as body - skip this until the
    // skipUntilText is found.
    final int startIndex = input.getPosition();
    final int tagNameLen = skipUntilText.length();


    int pos = input.getPosition() - 1;
    String endTagText = null;
    int lastPos = 0;
    while (!skipUntilText.equalsIgnoreCase(endTagText))
    {
      pos = input.find("</", pos + 1);
      if ((pos == -1) || ((pos + (tagNameLen + 2)) >= input.size()))
      {
        throw new ParseException(
          skipUntilText + " tag not closed" + getLineAndColumnText(), startIndex);
      }


      lastPos = pos + 2;
      endTagText = input.getSubstring(lastPos, lastPos + tagNameLen).toString();
    }


    input.setPosition(pos);
    lastText = input.getSubstring(startIndex, pos);
    lastType = HttpTagType.BODY;


    // Check that the tag is properly closed
    lastPos = input.find('>', lastPos + tagNameLen);
    if (lastPos == -1)
    {
      throw new ParseException(skipUntilText + " tag not closed" + getLineAndColumnText(),
        startIndex);
    }


    // Reset the state variable
    skipUntilText = null;
  }


  /**
   * 
   * @return line and column number
   */
  private String getLineAndColumnText()
  {
    return " (line " + input.getLineNumber() + ", column " + input.getColumnNumber() + ")";
  }


  /**
   * @return XXX
   * @throws ParseException
   */
  @Override
  public final HttpTagType next() throws ParseException
  {
    // Reached end of markup file?
    if (input.getPosition() >= input.size())
    {
      return HttpTagType.NOT_INITIALIZED;
    }


    if (skipUntilText != null)
    {
      skipUntil();
      return lastType;
    }


    // Any more tags in the markup?
    final int openBracketIndex = input.find('<');


    // Tag or Body?
    if (input.charAt(input.getPosition()) != '<')
    {
      // It's a BODY
      if (openBracketIndex == -1)
      {
        // There is no next matching tag.
        lastText = input.getSubstring(-1);
        input.setPosition(input.size());
        lastType = HttpTagType.BODY;
        return lastType;
      }


      lastText = input.getSubstring(openBracketIndex);
      input.setPosition(openBracketIndex);
      lastType = HttpTagType.BODY;
      return lastType;
    }


    // Determine the line number
    input.countLinesTo(openBracketIndex);


    // Get index of closing tag and advance past the tag
    int closeBracketIndex = -1;


    if (openBracketIndex != -1 && openBracketIndex < input.size() - 1)
    {
      char nextChar = input.charAt(openBracketIndex + 1);


      if ((nextChar == '!') || (nextChar == '?'))
        closeBracketIndex = input.find('>', openBracketIndex);
      else
        closeBracketIndex = input.findOutOfQuotes('>', openBracketIndex);
    }


    if (closeBracketIndex == -1)
    {
      throw new ParseException("No matching close bracket at" + getLineAndColumnText(),
        input.getPosition());
    }


    // Get the complete tag text
    lastText = input.getSubstring(openBracketIndex, closeBracketIndex + 1);


    // Get the tagtext between open and close brackets
    String tagText = lastText.subSequence(1, lastText.length() - 1).toString();
    if (tagText.length() == 0)
    {
      throw new ParseException("Found empty tag: '<>' at" + getLineAndColumnText(),
        input.getPosition());
    }


    // Type of the tag, to be determined next
    final TagType type;


    // If the tag ends in '/', it's a "simple" tag like <foo/>
    if (tagText.endsWith("/"))
    {
      type = TagType.OPEN_CLOSE;
      tagText = tagText.substring(0, tagText.length() - 1);
    }
    else if (tagText.startsWith("/"))
    {
      // The tag text starts with a '/', it's a simple close tag
      type = TagType.CLOSE;
      tagText = tagText.substring(1);
    }
    else
    {
      // It must be an open tag
      type = TagType.OPEN;


      // If open tag and starts with "s" like "script" or "style", than ...
      if ((tagText.length() > STYLE.length()) &&
        ((tagText.charAt(0) == 's') || (tagText.charAt(0) == 'S')))
      {
        final String lowerCase = tagText.toLowerCase();
        if (lowerCase.startsWith(SCRIPT))
        {
          String typeAttr = "type=";
          int idxOfType = lowerCase.indexOf(typeAttr);
          if (idxOfType > 0)
          {
            // +1 to remove the ' or "
            String typePrefix = lowerCase.substring(idxOfType + typeAttr.length() + 1);
            if (typePrefix.startsWith("text/javascript"))
            {
              // prepare to skip everything between the open and close tag
              skipUntilText = SCRIPT;
            }
            // any other type is assumed to be a template so it can contain child nodes.
            // See WICKET-5288
          }
          else
          {
            // no type attribute so it is 'text/javascript'
            // prepare to skip everything between the open and close tag
            skipUntilText = SCRIPT;
          }
        }
        else if (lowerCase.startsWith(STYLE))
        {
          // prepare to skip everything between the open and close tag
          skipUntilText = STYLE;
        }
      }
    }


    // Handle special tags like <!-- and <![CDATA ...
    final char firstChar = tagText.charAt(0);
    if ((firstChar == '!') || (firstChar == '?'))
    {
      specialTagHandling(tagText, openBracketIndex, closeBracketIndex);


      input.countLinesTo(openBracketIndex);
      TextSegment text = new TextSegment(lastText, openBracketIndex, input.getLineNumber(),
        input.getColumnNumber());
      lastTag = new XmlTag(text, type);


      return lastType;
    }


    TextSegment text = new TextSegment(lastText, openBracketIndex, input.getLineNumber(),
      input.getColumnNumber());
    XmlTag tag = new XmlTag(text, type);
    lastTag = tag;


    // Parse the tag text and populate tag attributes
    if (parseTagText(tag, tagText))
    {
      // Move to position after the tag
      input.setPosition(closeBracketIndex + 1);
      lastType = HttpTagType.TAG;
      return lastType;
    }
    else
    {
      throw new ParseException("Malformed tag" + getLineAndColumnText(), openBracketIndex);
    }
  }


  /**
   * Handle special tags like <!-- --> or <![CDATA[..]]> or <?xml>
   * 
   * @param tagText
   * @param openBracketIndex
   * @param closeBracketIndex
   * @throws ParseException
   */
  protected void specialTagHandling(String tagText, final int openBracketIndex,
    int closeBracketIndex) throws ParseException
  {
    // Handle comments
    if (tagText.startsWith("!--"))
    {
      // downlevel-revealed conditional comments e.g.: <!--[if (gt IE9)|!(IE)]><!-->
      if (tagText.contains("![endif]--"))
      {
        lastType = HttpTagType.CONDITIONAL_COMMENT_ENDIF;


        // Move to position after the tag
        input.setPosition(closeBracketIndex + 1);
        return;
      }


      // Conditional comment? E.g.
      // "<!--[if IE]><a href='test.html'>my link</a><![endif]-->"
      if (tagText.startsWith("!--[if ") && tagText.endsWith("]"))
      {
        int pos = input.find("]-->", openBracketIndex + 1);
        if (pos == -1)
        {
          throw new ParseException("Unclosed conditional comment beginning at" +
            getLineAndColumnText(), openBracketIndex);
        }


        pos += 4;
        lastText = input.getSubstring(openBracketIndex, pos);


        // Actually it is no longer a comment. It is now
        // up to the browser to select the section appropriate.
        input.setPosition(closeBracketIndex + 1);
        lastType = HttpTagType.CONDITIONAL_COMMENT;
      }
      else
      {
        // Normal comment section.
        // Skip ahead to "-->". Note that you can not simply test for
        // tagText.endsWith("--") as the comment might contain a '>'
        // inside.
        int pos = input.find("-->", openBracketIndex + 1);
        if (pos == -1)
        {
          throw new ParseException("Unclosed comment beginning at" +
            getLineAndColumnText(), openBracketIndex);
        }


        pos += 3;
        lastText = input.getSubstring(openBracketIndex, pos);
        lastType = HttpTagType.COMMENT;
        input.setPosition(pos);
      }
      return;
    }


    // The closing tag of a conditional comment, e.g.
    // "<!--[if IE]><a href='test.html'>my link</a><![endif]-->
    // and also <!--<![endif]-->"
    if (tagText.equals("![endif]--"))
    {
      lastType = HttpTagType.CONDITIONAL_COMMENT_ENDIF;
      input.setPosition(closeBracketIndex + 1);
      return;
    }


    // CDATA sections might contain "<" which is not part of an XML tag.
    // Make sure escaped "<" are treated right
    if (tagText.startsWith("!["))
    {
      final String startText = (tagText.length() <= 8 ? tagText : tagText.substring(0, 8));
      if (startText.toUpperCase().equals("![CDATA["))
      {
        int pos1 = openBracketIndex;
        do
        {
          // Get index of closing tag and advance past the tag
          closeBracketIndex = findChar('>', pos1);


          if (closeBracketIndex == -1)
          {
            throw new ParseException("No matching close bracket at" +
              getLineAndColumnText(), input.getPosition());
          }


          // Get the tagtext between open and close brackets
          tagText = input.getSubstring(openBracketIndex + 1, closeBracketIndex)
            .toString();


          pos1 = closeBracketIndex + 1;
        }
        while (tagText.endsWith("]]") == false);


        // Move to position after the tag
        input.setPosition(closeBracketIndex + 1);


        lastText = tagText;
        lastType = HttpTagType.CDATA;
        return;
      }
    }


    if (tagText.charAt(0) == '?')
    {
      lastType = HttpTagType.PROCESSING_INSTRUCTION;


      // Move to position after the tag
      input.setPosition(closeBracketIndex + 1);
      return;
    }


    if (tagText.startsWith("!DOCTYPE"))
    {
      lastType = HttpTagType.DOCTYPE;


      // Get the tagtext between open and close brackets
      doctype = input.getSubstring(openBracketIndex + 1, closeBracketIndex);


      // Move to position after the tag
      input.setPosition(closeBracketIndex + 1);
      return;
    }


    // Move to position after the tag
    lastType = HttpTagType.SPECIAL_TAG;
    input.setPosition(closeBracketIndex + 1);
  }


  /**
   * @return MarkupElement
   */
  @Override
  public final XmlTag getElement()
  {
    return lastTag;
  }


  /**
   * @return The xml string from the last element
   */
  @Override
  public final CharSequence getString()
  {
    return lastText;
  }


  /**
   * @return The next XML tag
   * @throws ParseException
   */
  public final XmlTag nextTag() throws ParseException
  {
    while (next() != HttpTagType.NOT_INITIALIZED)
    {
      switch (lastType)
      {
        case TAG :
          return lastTag;


        case BODY :
          break;


        case COMMENT :
          break;


        case CONDITIONAL_COMMENT :
          break;


        case CDATA :
          break;


        case PROCESSING_INSTRUCTION :
          break;


        case SPECIAL_TAG :
          break;
      }
    }


    return null;
  }


  /**
   * Find the char but ignore any text within ".." and '..'
   * 
   * @param ch
   *            The character to search
   * @param startIndex
   *            Start index
   * @return -1 if not found, else the index
   */
  private int findChar(final char ch, int startIndex)
  {
    char quote = 0;


    for (; startIndex < input.size(); startIndex++)
    {
      final char charAt = input.charAt(startIndex);
      if (quote != 0)
      {
        if (quote == charAt)
        {
          quote = 0;
        }
      }
      else if ((charAt == '"') || (charAt == '\''))
      {
        quote = charAt;
      }
      else if (charAt == ch)
      {
        return startIndex;
      }
    }


    return -1;
  }


  /**
   * Parse the given string.
   * <p>
   * Note: xml character encoding is NOT applied. It is assumed the input provided does have the
   * correct encoding already.
   * 
   * @param string
   *            The input string
   * @throws IOException
   *             Error while reading the resource
   */
  @Override
  public void parse(final CharSequence string) throws IOException
  {
    Args.notNull(string, "string");


    this.input = new FullyBufferedReader(new StringReader(string.toString()));
    this.encoding = null;
  }


  /**
   * Reads and parses markup from an input stream, using UTF-8 encoding by default when not
   * specified in XML declaration.
   * 
   * @param in
   *            The input stream to read and parse
   * @throws IOException
   * 
   * @see {@link #parse(InputStream, String)}
   */
  @Override
  public void parse(final InputStream in) throws IOException
  {
    // When XML declaration does not specify encoding, it defaults to UTF-8
    parse(in, "UTF-8");
  }


  /**
   * Reads and parses markup from an input stream.
   * <p>
   * Note: The input is closed after parsing.
   * 
   * @param inputStream
   *            The input stream to read and parse
   * @param encoding
   *            The default character encoding of the input
   * @throws IOException
   */
  @Override
  public void parse(final InputStream inputStream, final String encoding) throws IOException
  {
    Args.notNull(inputStream, "inputStream");


    try
    {
      XmlReader xmlReader = new XmlReader(new BufferedInputStream(inputStream, 4000),
        encoding);
      this.input = new FullyBufferedReader(xmlReader);
      this.encoding = xmlReader.getEncoding();
    }
    finally
    {
      IOUtils.closeQuietly(inputStream);
    }
  }


  @Override
  public final void setPositionMarker()
  {
    input.setPositionMarker(input.getPosition());
  }


  @Override
  public final void setPositionMarker(final int pos)
  {
    input.setPositionMarker(pos);
  }


  @Override
  public String toString()
  {
    return input.toString();
  }


  /**
   * Parses the text between tags. For example, "a href=foo.html".
   * 
   * @param tag
   * @param tagText
   *            The text between tags
   * @return false in case of an error
   * @throws ParseException
   */
  private boolean parseTagText(final XmlTag tag, final String tagText) throws ParseException
  {
    // Get the length of the tagtext
    final int tagTextLength = tagText.length();


    // If we match tagname pattern
    final TagNameParser tagnameParser = new TagNameParser(tagText);
    if (tagnameParser.matcher().lookingAt())
    {
      // Extract the tag from the pattern matcher
      tag.name = tagnameParser.getName();
      tag.namespace = tagnameParser.getNamespace();


      // Are we at the end? Then there are no attributes, so we just
      // return the tag
      int pos = tagnameParser.matcher().end(0);
      if (pos == tagTextLength)
      {
        return true;
      }


      // Extract attributes
      final VariableAssignmentParser attributeParser = new VariableAssignmentParser(tagText);
      while (attributeParser.matcher().find(pos))
      {
        // Get key and value using attribute pattern
        String value = attributeParser.getValue();


        // In case like <html xmlns:wicket> will the value be null
        if (value == null)
        {
          value = "";
        }


        // Set new position to end of attribute
        pos = attributeParser.matcher().end(0);


        // Chop off double quotes or single quotes
        if (value.startsWith("\"") || value.startsWith("\'"))
        {
          value = value.substring(1, value.length() - 1);
        }


        // Trim trailing whitespace
        value = value.trim();


        // Unescape
        value = Strings.unescapeMarkup(value).toString();


        // Get key
        final String key = attributeParser.getKey();


        // Put the attribute in the attributes hash
        if (null != tag.getAttributes().put(key, value))
        {
          throw new ParseException("Same attribute found twice: " + key +
            getLineAndColumnText(), input.getPosition());
        }


        // The input has to match exactly (no left over junk after
        // attributes)
        if (pos == tagTextLength)
        {
          return true;
        }
      }


      return true;
    }


    return false;
  }
}
// Source Code of org.apache.wicket.markup.parser.XmlPullParser

// Related Classes of org.apache.wicket.markup.parser.XmlPullParser