Package com.btaz.datautil.files

Source Code of com.btaz.datautil.files.FileSplitter

package com.btaz.datautil.files;

import com.btaz.datautil.DataUtilDefaults;
import com.btaz.datautil.DataUtilException;

import java.io.*;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

/**
* User: msundell
*/
public class FileSplitter {
    private static final long DEFAULT_MAX_BYTES = 10 * 1024 * 1024; // 10 MB RAM, roughly 4.7 MB per file on disk
    private static final long DEFAULT_MIN_BYTES = 10 * 1024;        // 10 KB RAM, roughly 4.7 KB per file on disk
    private static final double LIST_CAPACITY_MULTIPLIER = 1.5D;    // compensate for how ArrayLists allocates capacity
    private static final int ARRAY_LIST_ROW_OVERHEAD = 5;           // compensate for ArrayLists item memory overhead

    /**
     * This method reads an input file and creates file splits if the file size in memory exceeds a certain size.
     * If the split directory is not specified the system specific temp directory will be used. This means that the
     * files will be deleted automatically for you when the JVM terminates. It is however a good idea to delete files
     * when no longer needed.
     * @param splitDir directory to create split files in.
     * @param inputFile the file we want to split into multiple files
     * @param skipHeader skip header, if true then the first row in the input file will be ignored
     * @return <code>List</code> of <code>FilePart</code> items
     */
    public static List<File> split(File splitDir, File inputFile, boolean skipHeader) {
        return split(splitDir, inputFile, DEFAULT_MAX_BYTES, skipHeader);
    }

    /**
     * This method reads an input file and creates file splits if the file size in memory exceeds a certain size.
     * If the split directory is not specified the system specific temp directory will be used. This means that the
     * files will be deleted automatically for you when the JVM terminates. It is however a good idea to delete files
     * when no longer needed.
     * @param splitDir directory to create split files in.
     * @param inputFile the file we want to split into multiple files
     * @param maxBytes the maximum in memory size a file is allowed to occupy (guesstimate). 10MB = 10 x 1024 x 1024
     * @param skipHeader skip header, if true then the first row in the input file will be ignored
     * @return <code>List</code> of <code>FilePart</code> items
     */
    public static List<File> split(File splitDir, File inputFile, long maxBytes, boolean skipHeader) {
        // validations
        if(splitDir == null) {
            throw new DataUtilException("The split directory parameter can not be a null value");
        } else if(!splitDir.exists()) {
            throw new DataUtilException("The split directory is not a valid path");
        }
        if(inputFile == null) {
            throw new DataUtilException("The input file parameter can not be a null value");
        } else if(!inputFile.exists()) {
            throw new DataUtilException("The input file doesn't exist");
        }
        if(maxBytes < DEFAULT_MIN_BYTES) {
            throw new DataUtilException("Invalid max bytes specification: " + maxBytes + ", minimum is: "
                    + DEFAULT_MIN_BYTES);
        }

        ArrayList<File> splitFiles = new ArrayList<File>();

        // open file
        FileInputStream inputStream = null;
        LinkedList<String> rows = new LinkedList<String>();
        try {
            // create FilePart
            inputStream = new FileInputStream(inputFile);
            BufferedReader br = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
            long charsTotal = 0;
            String line;

            int maxChunk = 60;
            int rowNumber = 0;
            while((line=br.readLine()) != null) {
                rowNumber += 1;
                if(skipHeader && rowNumber == 1) {
                    continue;
                }
                if(storageCalculation(rows.size(), line.length(), charsTotal) > maxBytes) {
                    if(maxChunk-- <= 0) {
                        break;
                    }
                    // we have to stop or we may exceed max part size
                    File splitFile = writePartToFile(splitDir, rows);
                    splitFiles.add(splitFile);
                    rows.clear();
                    charsTotal = 0;
                }
                rows.add(line);
                charsTotal += line.length() + DataUtilDefaults.lineTerminator.length();
            }
            // no more data, finish up the final part
            if(charsTotal > 0) {
                File splitFile = writePartToFile(splitDir, rows);
                splitFiles.add(splitFile);
            }

            inputStream.close();
            inputStream = null;

        } catch (FileNotFoundException e) {
            throw new DataUtilException(e);
        } catch (UnsupportedEncodingException e) {
            throw new DataUtilException(e);
        } catch (IOException e) {
            throw new DataUtilException(e);
        } finally {
            if(inputStream != null) {
                try {
                    inputStream.close();
                } catch (IOException e) {
                    // intentionally do nothing
                }
            }
        }

        return splitFiles;
    }

    /**
     * This method writes a partial input file to a split file
     * @param splitDir split file directory
     * @param data data
     * @return <code>File</code> new split file
     */
    private static File writePartToFile(File splitDir, List<String> data) {
        BufferedWriter writer = null;
        File splitFile;
        try {
            splitFile = File.createTempFile("split-", ".part", splitDir);
            writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(splitFile.getAbsoluteFile(), false),
                    DataUtilDefaults.charSet));
            for(String item : data) {
                writer.write(item + DataUtilDefaults.lineTerminator);
            }
            writer.flush();
            writer.close();
            writer = null;
        } catch (UnsupportedEncodingException e) {
            throw new DataUtilException(e);
        } catch (FileNotFoundException e) {
            throw new DataUtilException(e);
        } catch (IOException e) {
            throw new DataUtilException(e);
        } finally {
            if(writer != null) {
                try {
                    writer.close();
                } catch (IOException e) {
                    // Intentionally we're doing nothing here
                }
            }
        }
        return splitFile;
    }

    /**
     * This calculation attempts to calculate how much storage x number of rows may use in an ArrayList assuming that
     * capacity may be up to 1.5 times the actual space used.
     * @param rows rows
     * @param lineChars the amount of new character data we want to add
     * @param totalCharacters total amount of characters already stored
     * @return long a guesstimate for storage required
     */
    private static long storageCalculation(int rows, int lineChars, long totalCharacters) {
        long size = (long) Math.ceil((rows* ARRAY_LIST_ROW_OVERHEAD + lineChars + totalCharacters) * LIST_CAPACITY_MULTIPLIER);
        return size;
    }
}
TOP

Related Classes of com.btaz.datautil.files.FileSplitter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.