Package org.apache.hadoop.hive.ql.io.orc

Source Code of org.apache.hadoop.hive.ql.io.orc.FileDump

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.io.orc;

import java.util.ArrayList;
import java.util.List;

import java.io.IOException;
import java.text.DecimalFormat;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcProto.RowIndex;
import org.apache.hadoop.hive.ql.io.orc.OrcProto.RowIndexEntry;
import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue;

/**
* A tool for printing out the file structure of ORC files.
*/
public final class FileDump {
  private static final String ROWINDEX_PREFIX = "--rowindex=";

  // not used
  private FileDump() {}

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    List<String> files = new ArrayList<String>();
    List<Integer> rowIndexCols = null;
    for (String arg : args) {
      if (arg.startsWith("--")) {
        if (arg.startsWith(ROWINDEX_PREFIX)) {
          String[] colStrs = arg.substring(ROWINDEX_PREFIX.length()).split(",");
          rowIndexCols = new ArrayList<Integer>(colStrs.length);
          for (String colStr : colStrs) {
            rowIndexCols.add(Integer.parseInt(colStr));
          }
        } else {
          System.err.println("Unknown argument " + arg);
        }
      } else {
        files.add(arg);
      }
    }

    for (String filename : files) {
      System.out.println("Structure for " + filename);
      Path path = new Path(filename);
      Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
      System.out.println("File Version: " + reader.getFileVersion().getName() +
                         " with " + reader.getWriterVersion());
      RecordReaderImpl rows = (RecordReaderImpl) reader.rows();
      System.out.println("Rows: " + reader.getNumberOfRows());
      System.out.println("Compression: " + reader.getCompression());
      if (reader.getCompression() != CompressionKind.NONE) {
        System.out.println("Compression size: " + reader.getCompressionSize());
      }
      System.out.println("Type: " + reader.getObjectInspector().getTypeName());
      System.out.println("\nStripe Statistics:");
      Metadata metadata = reader.getMetadata();
      for (int n = 0; n < metadata.getStripeStatistics().size(); n++) {
        System.out.println("  Stripe " + (n + 1) + ":");
        StripeStatistics ss = metadata.getStripeStatistics().get(n);
        for (int i = 0; i < ss.getColumnStatistics().length; ++i) {
          System.out.println("    Column " + i + ": " +
              ss.getColumnStatistics()[i].toString());
        }
      }
      ColumnStatistics[] stats = reader.getStatistics();
      System.out.println("\nFile Statistics:");
      for(int i=0; i < stats.length; ++i) {
        System.out.println("  Column " + i + ": " + stats[i].toString());
      }
      System.out.println("\nStripes:");
      int stripeIx = -1;
      for (StripeInformation stripe : reader.getStripes()) {
        ++stripeIx;
        long stripeStart = stripe.getOffset();
        System.out.println("  Stripe: " + stripe.toString());
        OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
        long sectionStart = stripeStart;
        for(OrcProto.Stream section: footer.getStreamsList()) {
          System.out.println("    Stream: column " + section.getColumn() +
            " section " + section.getKind() + " start: " + sectionStart +
            " length " + section.getLength());
          sectionStart += section.getLength();
        }
        for (int i = 0; i < footer.getColumnsCount(); ++i) {
          OrcProto.ColumnEncoding encoding = footer.getColumns(i);
          StringBuilder buf = new StringBuilder();
          buf.append("    Encoding column ");
          buf.append(i);
          buf.append(": ");
          buf.append(encoding.getKind());
          if (encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY ||
              encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
            buf.append("[");
            buf.append(encoding.getDictionarySize());
            buf.append("]");
          }
          System.out.println(buf);
        }
        if (rowIndexCols != null) {
          RowIndex[] indices = rows.readRowIndex(stripeIx);
          for (int col : rowIndexCols) {
            StringBuilder buf = new StringBuilder();
            buf.append("    Row group index column ").append(col).append(":");
            RowIndex index = null;
            if ((col >= indices.length) || ((index = indices[col]) == null)) {
              buf.append(" not found\n");
              continue;
            }
            for (int entryIx = 0; entryIx < index.getEntryCount(); ++entryIx) {
              buf.append("\n      Entry ").append(entryIx).append(":");
              RowIndexEntry entry = index.getEntry(entryIx);
              if (entry == null) {
                buf.append("unknown\n");
                continue;
              }
              OrcProto.ColumnStatistics colStats = entry.getStatistics();
              if (colStats == null) {
                buf.append("no stats at ");
              } else {
                ColumnStatistics cs = ColumnStatisticsImpl.deserialize(colStats);
                Object min = RecordReaderImpl.getMin(cs), max = RecordReaderImpl.getMax(cs);
                buf.append(" count: ").append(cs.getNumberOfValues());
                buf.append(" min: ").append(min);
                buf.append(" max: ").append(max);
              }
              buf.append(" positions: ");
              for (int posIx = 0; posIx < entry.getPositionsCount(); ++posIx) {
                if (posIx != 0) {
                  buf.append(",");
                }
                buf.append(entry.getPositions(posIx));
              }
            }
            System.out.println(buf);
          }
        }
      }

      FileSystem fs = path.getFileSystem(conf);
      long fileLen = fs.getContentSummary(path).getLength();
      long paddedBytes = getTotalPaddingSize(reader);
      // empty ORC file is ~45 bytes. Assumption here is file length always >0
      double percentPadding = ((double) paddedBytes / (double) fileLen) * 100;
      DecimalFormat format = new DecimalFormat("##.##");
      System.out.println("\nFile length: " + fileLen + " bytes");
      System.out.println("Padding length: " + paddedBytes + " bytes");
      System.out.println("Padding ratio: " + format.format(percentPadding) + "%");
      rows.close();
    }
  }

  private static long getTotalPaddingSize(Reader reader) throws IOException {
    long paddedBytes = 0;
    List<org.apache.hadoop.hive.ql.io.orc.StripeInformation> stripes = reader.getStripes();
    for (int i = 1; i < stripes.size(); i++) {
      long prevStripeOffset = stripes.get(i - 1).getOffset();
      long prevStripeLen = stripes.get(i - 1).getLength();
      paddedBytes += stripes.get(i).getOffset() - (prevStripeOffset + prevStripeLen);
    }
    return paddedBytes;
  }
}
TOP

Related Classes of org.apache.hadoop.hive.ql.io.orc.FileDump

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.