Package com.cloudera.flume.handlers.text

Source Code of com.cloudera.flume.handlers.text.ApacheAccessLogFormat

/**
* Licensed to Cloudera, Inc. under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  Cloudera, Inc. licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.flume.handlers.text;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.cloudera.flume.core.Event;
import com.cloudera.flume.core.EventImpl;
import com.cloudera.flume.core.Event.Priority;
import com.cloudera.util.Clock;
import com.cloudera.util.NetUtils;

/**
* Apache Access Logs are configurable to have custom formats. The default
* (which we parse here) is :
*
* Default configured to be: LogFormat "%h %l %u %t \"%r\" %>s %b" common
*
* which means:
*
* %h ip of remote host
*
* %l identd check (usually '-' because this data is usually unreliable)
*
* %u user according to HTTP Authentication. '-' if no pw
*
* %t time request received in apache date format.
*
* %r request string. The format has it in quotes.
*
* %>s status code sent back to client
*
* %b size of object sent to client not including header. '-' here means 0 data.
*
* Other escape sequences are explained here:
* http://httpd.apache.org/docs/2.0/logs.html
*
* Another common one is the CombinedLogFormat which adds two fields:
*
* LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\""
* combined
*
* TODO (jon) Add support for parsing based on Apache LogFormat directive
* http://httpd.apache.org/docs/1.3/mod/mod_log_config.html#formats
*
* TODO (jon) These links talk about the cost incurred by the Calendar class.
* Part of it is due to the fact that it is not thread safe. The extract method
* is very likely to run in multiple threads at some point so this will become
* an issue.
*
* http://blog.bielu.com/2008/08/javautilcalendar-confusion-is-it-safe_28.html
*
* The article links to the library below -- apparently to be standard as part
* of java 7.
*
* http://joda-time.sourceforge.net/
*
* TODO (jon) implement apache format outputter.
*/
public class ApacheAccessLogFormat implements InputFormat {
  static final Logger LOG = LoggerFactory.getLogger(ApacheAccessLogFormat.class);

  final static Pattern APACHE_PAT = Pattern
      .compile("^(\\S+) (\\S+) (\\S+) \\[(.*?)\\] \\\"(.*?)\\\" (\\S+) (\\S+)( \\\"(.*?)\\\" \\\"(.*?)\\\")?$");
  final SimpleDateFormat APACHE_DF = new SimpleDateFormat(
      "dd/MMM/yyyy:HH:mm:ss zzzzz");

  /**
   * This attempts returns null if a line fails to be parsed.
   */
  public Event extract(String s) {
    Matcher m = APACHE_PAT.matcher(s);
    if (!m.matches())
      return null;

    try {
      String service = "apache";
      String date = m.group(4);
      Date d = APACHE_DF.parse(date);
      Calendar c = Calendar.getInstance();
      c.setTime(d);
      d = c.getTime();

      String host = NetUtils.localhost(); // local host

      // TODO(jon) body should be the raw entry, and a new field should be
      // created for this instead.
      String body = m.group(5);
      String client = m.group(1);
      String res = m.group(6);
      String size = m.group(7);

      String referrer = m.group(9);
      String browser = m.group(10);

      Map<String, byte[]> fields = new HashMap<String, byte[]>();
      fields.put("service", service.getBytes());
      fields.put("client", client.getBytes());
      fields.put("req_result", res.getBytes());
      fields.put("req_size", size.getBytes());

      if (referrer != null && !referrer.equals("-")) {
        fields.put("referrer", referrer.getBytes());
      }

      if (browser != null && !browser.equals("-")) {
        fields.put("browser", browser.getBytes());
      }

      Event e = new EventImpl(body.getBytes(), d.getTime(), Priority.INFO,
          Clock.nanos(), host, fields);
      return e;
    } catch (ParseException e) {
      LOG.warn("Failed to parse apache access log line: '" + s + "'", e);
      return null;
    }

  }
}
TOP

Related Classes of com.cloudera.flume.handlers.text.ApacheAccessLogFormat

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.