Package com.google.i18n.pseudolocalization.methods

Source Code of com.google.i18n.pseudolocalization.methods.HtmlPreserver

/*
* Copyright 2011 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.i18n.pseudolocalization.methods;

import com.google.i18n.pseudolocalization.PseudolocalizationMethod;
import com.google.i18n.pseudolocalization.PseudolocalizationPipeline;
import com.google.i18n.pseudolocalization.message.DefaultVisitor;
import com.google.i18n.pseudolocalization.message.MessageFragment;
import com.google.i18n.pseudolocalization.message.TextFragment;
import com.google.i18n.pseudolocalization.message.VisitorContext;

import org.htmlparser.Attribute;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.NodeVisitor;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
* A pseudolocalization method that attempts to parse HTML and leave the tags
* and their attributes alone, while passing other content through the rest of
* the pseudolocalization chain. This should generally be the first method on
* the chain.
*/
public class HtmlPreserver extends DefaultVisitor implements PseudolocalizationMethod {

  /**
   * Regex to match HTML entities - see the
   * <a href="http://www.w3.org/TR/html4/charset.html#entities">HTML4 spec</a>.
   */
  private static final Pattern ENTITY_PATTERN = Pattern.compile(
      "&((#\\d+)|(#[xX][a-fA-F0-9]+)|(\\w+));");

  private static final Set<String> LOCALIZABLE_ATTRIBUTES;

  static {
    LOCALIZABLE_ATTRIBUTES = new HashSet<String>();
    // TODO: some automated way of generating this list
    LOCALIZABLE_ATTRIBUTES.add("alt");
    LOCALIZABLE_ATTRIBUTES.add("title");
    LOCALIZABLE_ATTRIBUTES.add("button/value");
    LOCALIZABLE_ATTRIBUTES.add("input/value");
    LOCALIZABLE_ATTRIBUTES.add("option/label");
    LOCALIZABLE_ATTRIBUTES.add("optgroup/label");
  }

  public static void register() {
    PseudolocalizationPipeline.registerMethodClass("html", HtmlPreserver.class);
  }

  @Override
  public void visitTextFragment(final VisitorContext ctx, TextFragment text) {
    final List<MessageFragment> result = new ArrayList<MessageFragment>();
    Parser parser = Parser.createParser(text.getText(), "UTF-8");
    try {
      parser.visitAllNodesWith(new NodeVisitor() {
        @Override
        public void visitEndTag(Tag tag) {
          result.add(ctx.createNonlocalizableTextFragment(tag.toHtml(true)));
        }

        @Override
        public void visitStringNode(Text string) {
          String val = string.getText();
          Matcher m = ENTITY_PATTERN.matcher(val);
          int start = 0;
          while (m.find()) {
            String plainText = val.substring(start, m.start());
            start = m.end();
            String entity = m.group();
            if (plainText.length() > 0) {
              result.add(ctx.createTextFragment(plainText));
            }
            result.add(ctx.createNonlocalizableTextFragment(entity));
          }
          String plainText = val.substring(start);
          if (plainText.length() > 0) {
            result.add(ctx.createTextFragment(plainText));
          }
        }

        @Override
        public void visitTag(Tag tag) {
          // Convert a tag into a sequence of fragments, which always start and
          // end with a non-localizable text fragment but may have localizable
          // text fragments in the middle.  For example,
          //    <input value="Submit">
          // becomes:
          //   [ NLTF('<input value="'), LTF('Submit'), NLTF('">') ]
         
          // Warning: this is derived from tag.toTagHtml, and may need to be
          // modified if HtmlParser is updated.
          @SuppressWarnings("unchecked")
          Vector<Attribute> attributes = tag.getAttributesEx();
          int n = attributes.size();
          StringBuffer buf = new StringBuffer();
          buf.append("<");
          String tagName = null;
          for (int i = 0; i < n; ++i){
              Attribute attribute = attributes.elementAt(i);
              if (i == 0) {
                tagName = attribute.getName();
              } else if (isLocalizableAttribute(tagName, attribute)) {
                attribute.getName(buf);
                attribute.getAssignment(buf);
                char quote = attribute.getQuote();
                if (quote != 0) {
                  buf.append(quote);
                }
                if (buf.length() > 0) {
                  result.add(ctx.createNonlocalizableTextFragment(buf.toString()));
                }
                result.add(ctx.createTextFragment(attribute.getValue()));
                buf = new StringBuffer();
                if (quote != 0) {
                  buf.append(quote);
                }
                continue;
              }
              attribute.toString(buf);
          }
          buf.append(">");
          result.add(ctx.createNonlocalizableTextFragment(buf.toString()));
        }

        /**
         * Check if a particular attribute contains localizable content.
         *
         * @param tagName
         * @param attribute
         * @return true if the attribute's value is localizable, false otherwise
         */
        private boolean isLocalizableAttribute(String tagName, Attribute attribute) {
          if (!attribute.isValued()) {
            return false;
          }
          String attrName = attribute.getName();
          return LOCALIZABLE_ATTRIBUTES.contains(attrName)
              || LOCALIZABLE_ATTRIBUTES.contains(tagName + "/" + attrName);
        }
      });
      ctx.replaceFragment(text, result);
    } catch (ParserException e) {
      // ignore parse failures
    }
    // if we can't parse it, leave it unchanged
  }
}
TOP

Related Classes of com.google.i18n.pseudolocalization.methods.HtmlPreserver

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.