Source Code of com.socrata.datasync.publishers.DeltaImporter2Publisher

package com.socrata.datasync.publishers;

import com.socrata.datasync.SizeCountingInputStream;
import com.socrata.datasync.Utils;
import com.socrata.datasync.HttpUtility;
import com.socrata.datasync.config.controlfile.ControlFile;
import com.socrata.datasync.config.userpreferences.UserPreferences;
import com.socrata.datasync.deltaimporter2.*;
import com.socrata.datasync.job.JobStatus;
import com.socrata.ssync.PatchComputer;
import com.socrata.ssync.SignatureComputer;
import com.socrata.ssync.SignatureTable;
import com.socrata.ssync.exceptions.input.InputException;
import com.socrata.ssync.exceptions.signature.SignatureException;
import org.apache.commons.io.IOUtils;
import org.apache.http.*;
import org.apache.http.client.entity.EntityBuilder;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.StringEntity;
import org.codehaus.jackson.JsonGenerationException;
import org.codehaus.jackson.JsonProcessingException;
import org.codehaus.jackson.map.DeserializationConfig;
import org.codehaus.jackson.map.ObjectMapper;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.security.NoSuchAlgorithmException;
import java.text.ParseException;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;

public class DeltaImporter2Publisher implements AutoCloseable {

    private static final String datasyncBasePath = "/datasync";
    private static final String datasyncPath = datasyncBasePath + "/id";
    private static final String statusPath = "/status";
    private static final String commitPath = "/commit";
    private static final String logPath = "/log";
    private static final String ssigContentType = "application/x-socrata-ssig";
    private static final String patchExtension = ".sdiff";
    private static final String compressionExtension = ".xz";
    private static final String finishedLogKey = "finished";
    private static final String committingLogKey = "committing-job";
    private static final String committedLogKey = "committed-job";
    private static final int httpRetries = 3;
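    // Fallback chunk size (~4 MB) used when the version service cannot be queried.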
    private static final int defaultChunkSize = 1024 * 4000;

    private static class CompletelyRestartJob extends Exception {}

    private String domain;
    private HttpUtility http;
    private URIBuilder baseUri;
    private ObjectMapper mapper = new ObjectMapper().enable(DeserializationConfig.Feature.ACCEPT_SINGLE_VALUE_AS_ARRAY);
    private String pathToSignature = null;
    private CloseableHttpResponse signatureResponse = null;

    public DeltaImporter2Publisher(UserPreferences userPrefs) {
        http = new HttpUtility(userPrefs, true);
        domain = userPrefs.getHost();
        baseUri = new URIBuilder()
                .setScheme("https")
                .setHost(domain);
    }

    @Override
    public void close() throws IOException {
        http.close();
    }

    /**
     * Publishes a complete csv or tsv file using delta-importer-2 over HTTP.
     * @param datasetId the 4x4 of the dataset to be replaced
     * @param csvOrTsvFile the csv or tsv file that is to replace the dataset with the given 4x4
     * @param controlFile the control file used to specialize the resulting dataset
     * @return a job status indicating success or failure
     */
    public JobStatus publishWithDi2OverHttp(String datasetId, final File csvOrTsvFile, ControlFile controlFile) throws
            IOException {

        System.out.println("Publishing " + csvOrTsvFile.getName() + " via delta-importer-2 over HTTP");
        DatasyncDirectory datasyncDir = new DatasyncDirectory(http, domain, datasetId);
        boolean useCompression = true;
        InputStream previousSignature = null;
        SizeCountingInputStream patch = null;
        int chunkSize = fetchDatasyncChunkSize();
        String uuid = controlFile.generateAndAddOpaqueUUID();
        int retryCount = 0;

        do {
            try {
                // get signature of previous csv/tsv file
                pathToSignature = datasyncDir.getPathToSignature();
                previousSignature = getPreviousSignature(pathToSignature);

                final long fileSize = csvOrTsvFile.length();
                InputStream progressingInputStream = new ProgressingInputStream(new FileInputStream(csvOrTsvFile)) {
                    @Override
                    protected void progress(long count) {
                        System.out.println("\tRead " + count + " of " + fileSize + " bytes of " + csvOrTsvFile.getName());
                    }
                };
                // compute the patch between the csv/tsv file and its previous signature
                patch = new SizeCountingInputStream(getPatch(progressingInputStream, previousSignature, chunkSize, useCompression));

                // post the patch file in blobby chunks - ewww
                List<String> blobIds = postPatchBlobs(patch, datasetId, chunkSize);

                // commit the chunks, thereby applying the diff
                CommitMessage commit = new CommitMessage()
                        .filename(csvOrTsvFile.getName() + patchExtension + (useCompression ? compressionExtension : ""))
                        .relativeTo(pathToSignature)
                        .chunks(blobIds)
                        .control(controlFile)
                        .expectedSize(patch.getTotal());
                String jobId = commitBlobPostings(commit, datasetId, uuid);

                // return status
                return getJobStatus(datasetId, jobId);
            } catch (CompletelyRestartJob e) {
                retryCount += 1;
            } catch (ParseException | NoSuchAlgorithmException | InputException | URISyntaxException |
                    SignatureException | InterruptedException | HttpException e) {
                e.printStackTrace();
                JobStatus jobStatus = JobStatus.PUBLISH_ERROR;
                jobStatus.setMessage(e.getMessage());
                return jobStatus;
            } finally {
                if (previousSignature != null) { previousSignature.close(); }
                if (patch != null) { patch.close(); }
                if (signatureResponse != null) { signatureResponse.close(); }
            }
        } while(retryCount < httpRetries);
        JobStatus jobStatus = JobStatus.PUBLISH_ERROR;
        jobStatus.setMessage("Couldn't get the request through; too many retries"); // TODO Better message
        return jobStatus;
    }


    /**
     * Returns an input stream to the signature of the previous version of the dataset to be replaced
     * NOTE: this has the side-effect of setting pathToSignature to null if the previous signature cannot be retrieved.
     * @param signaturePath the complete path to the signature, e.g. /datasync/id/some-4by4/completed/..., or null
     * @return an input stream to the previous signature (possibly the null signature)
     */
    private InputStream getPreviousSignature(String signaturePath) throws
            IOException, URISyntaxException, NoSuchAlgorithmException {
        System.out.println("Acquiring the previous signature or creating a null signature");
        if(signaturePath == null) {
            // if no previously completed signature, return stream of the null signature
            return getNullSignature();
        } else {
            // otherwise, return the completed signature stored by delta-importer-2
            URI absolutePath = baseUri.setPath(signaturePath).build();
            signatureResponse = http.get(absolutePath, ssigContentType);
            int statusCode = signatureResponse.getStatusLine().getStatusCode();
            if (statusCode == HttpStatus.SC_OK || statusCode == HttpStatus.SC_NOT_MODIFIED) {
                return signatureResponse.getEntity().getContent();
            } else {
                // if we have a problem getting the signature, we can still use the null signature,
                // but must set the pathToSignature to null for the relativeTo field in the commit
                pathToSignature = null;
                return getNullSignature();
            }
        }
    }


    /**
     * Computes the diff of the csv or tsv file against the most recent completed signature
     * if there is one, or against the null signature otherwise
     * @param newFile an input stream to the new file that is to replace the old
     * @param previousSignature an input stream to the previous signature
     * @param chunkSize the chunk size in bytes, used to size the compression buffer
     * @param compress whether to compress the patch using xz compression
     * @return an input stream containing the possibly compressed patch
     */
    private InputStream getPatch(InputStream newFile, InputStream previousSignature, int chunkSize, boolean compress) throws
            SignatureException, IOException, InputException, NoSuchAlgorithmException {
        System.out.println("Calculating the diff between the source file and previous signature");
        BufferedInputStream newStream = new BufferedInputStream(newFile);
        BufferedInputStream previousStream = new BufferedInputStream(previousSignature);
        if (!compress) {
            return new PatchComputer.PatchComputerInputStream(newStream, new SignatureTable(previousStream), "MD5", 102400);
        } else {
            InputStream patchStream = new PatchComputer.PatchComputerInputStream(newStream, new SignatureTable(previousStream), "MD5", 1024000);
            return new XZCompressInputStream(patchStream, 2 * chunkSize);
        }
    }

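    /**
     * Asks the delta-importer-2 version service for the maximum block size it accepts,
     * falling back to the ~4 MB default if the service cannot be reached.
     * @return the chunk size in bytes to use when uploading the patch
     */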
    private int fetchDatasyncChunkSize() {
        URI versionServicePath;
        int retryCount = 0;

        try {
            versionServicePath = baseUri.setPath(datasyncBasePath + "/version.json").build();
        } catch (URISyntaxException e) {
            System.out.println("Couldn't construct version.json URI?  Using " + defaultChunkSize + " for the chunk-size");
            return defaultChunkSize;
        }

        while(true) {
            try(CloseableHttpResponse response = http.get(versionServicePath, ContentType.APPLICATION_JSON.getMimeType())) {
                return mapper.readValue(response.getEntity().getContent(), Version.class).maxBlockSize;
            } catch (Exception e) {
                retryCount += 1;
                if(retryCount >= httpRetries) {
                    // The next step will probably fail too if we can't even hit DI's version service,
                    // but might as well soldier on regardless.
                    System.out.println("Unable to detect chunk size; using " + defaultChunkSize);
                    return defaultChunkSize;
                }
            }
        }
    }

    /**
     * Chunks up the patch file into chunk-size pieces (~4 MB by default) and posts these to delta-importer-2
     * @param patchStream an inputStream to the patch
     * @param datasetId the 4x4 of the dataset being patched
     * @param chunkSize the size in bytes of each chunk to post
     * @return the list of blobIds corresponding to each successful post
     */
    private List<String> postPatchBlobs(InputStream patchStream, String datasetId, int chunkSize) throws
            IOException, URISyntaxException, HttpException {
        System.out.println("Chunking and posting the diff");

        URI postingPath = baseUri.setPath(datasyncPath + "/" + datasetId).build();
        List<String> blobIds = new LinkedList<>();
        byte[] bytes = new byte[chunkSize];
        StatusLine statusLine;
        int status;
        int bytesRead;

        while ((bytesRead = Utils.readChunk(patchStream, bytes, 0, bytes.length)) != -1) {
            System.out.println("\tUploading " + bytesRead + " bytes of the diff");
            byte[] chunk = bytesRead == bytes.length ? bytes : Arrays.copyOf(bytes, bytesRead);
            HttpEntity entity = EntityBuilder.create().setBinary(chunk).build();
            int retries = 0;
            do {
                try(CloseableHttpResponse response = http.post(postingPath, entity)) {
                    statusLine = response.getStatusLine();
                    status = statusLine.getStatusCode();
                    if (status != HttpStatus.SC_CREATED) {
                        retries += 1;
                    } else {
                        String blobId = mapper.readValue(response.getEntity().getContent(), BlobId.class).blobId;
                        blobIds.add(blobId);
                    }
                }
            } while (status != HttpStatus.SC_CREATED && retries < httpRetries);
            // We hit the max number of retries without success and should throw an exception accordingly.
            if (retries >= httpRetries) throw new HttpException(statusLine.toString());
            System.out.println("\tUploaded " + bytesRead + " bytes");
        }
        return blobIds;
    }


    /**
     * Commits the blobs that were posted.
     * @param msg a message containing the necessary info for applying the patch to the most recent signature file
     * @param datasetId the 4x4 of the dataset to which the blobs belong
     * @return the jobId of the job applying the diff
     */
    private String commitBlobPostings(final CommitMessage msg, final String datasetId, final String uuid) throws URISyntaxException, IOException, CompletelyRestartJob {
        System.out.println("Committing the chunked diffs to apply the patch");
        final URI committingPath = baseUri.setPath(datasyncPath + "/" + datasetId + commitPath).build();

        // This is kinda ugly (kinda?) -- since this request isn't idempotent, we're handling retry logic ourselves
        // with checks to make sure that the request didn't actually go through on a failure.
        class PostStateMachine {
            String jobId = null;
            IOException lastPostException = null;
            int retries = 0;

            void go() throws IOException, URISyntaxException, CompletelyRestartJob {
                StringEntity entity = new StringEntity(mapper.writeValueAsString(msg), ContentType.APPLICATION_JSON);
                try (CloseableHttpResponse response = doPost(entity)) {
                    if(response == null) handleIOErrorPath();
                    else handleHttpResponsePath(response);
                }
            }

            CloseableHttpResponse doPost(HttpEntity entity) {
                try {
                    return http.post(committingPath, entity);
                } catch (IOException e) {
                    lastPostException = e;
                    return null;
                }
            }

            void handleIOErrorPath() throws IOException, URISyntaxException {
                checkCommitment();
                if(retries >= httpRetries) throw lastPostException;
            }

            void checkCommitment() throws URISyntaxException {
                Committal committal = getJobCommitment(datasetId, uuid);
                switch(committal.status) {
                    case COMMITTED:
                        jobId = committal.jobId;
                        break;
                    case COMMITTING:
                        retries += httpRetries-1;   // we only retry this case once, since this is typically fatal
                        break;
                    case NOT_COMMITTING:
                    case UNKNOWN:
                        retries++;
                }
            }

            void handleHttpResponsePath(CloseableHttpResponse response) throws IOException, URISyntaxException, CompletelyRestartJob {
                StatusLine statusLine = response.getStatusLine();
                int status = statusLine.getStatusCode();
                if (status != HttpStatus.SC_OK) {
                    DI2Error error = null;
                    if(status == HttpStatus.SC_BAD_REQUEST) {
                        try {
                            error = mapper.readValue(response.getEntity().getContent(), DI2Error.class);
                        } catch (JsonProcessingException e) {
                            // Bad request response which wasn't from DI2, and hence not JSON or not in the
                            // format we require?
                        } catch (IOException e) {
                            lastPostException = e;
                            handleIOErrorPath();
                            return;
                        }
                    }
                    response.close();

                    if(error == null) {
                        // it wasn't a failure due to something we sent (or if it was, it's a cause
                        // we don't know about!), so let's see if the commitment went through anyway.
                        checkCommitment();
                    } else {
                        switch(error.type) {
                            case NON_UNIFORM_CHUNK:
                            case NONEXISTANT_CHUNK:
                            case SIZE_MISMATCH:
                                // these can all be plausibly caused by errors in DataSync's network handling
                                // (non-uniform chunk less so than the others) so we'll retry.
                                // TODO: report this to Socrata!
                                throw new CompletelyRestartJob();
                            case FILENAME_BAD_CHARACTERS:
                            case FILENAME_TOO_LONG:
                                // These are caused by the human and won't be fixed by retrying.  Bad human, no cookie!
                                // But we should give them a human-readable error at least (preferably we should prevent
                                // it from happening in the first place).  That's a TODO
                                //
                                // For now, fall through to the "terminate processing" case.
                            default:
                                // anything else is a bug in datasync.  TODO: report this to Socrata!
                                retries = httpRetries;
                        }
                    }
                    if(retries >= httpRetries) throw new IOException("Unable to commit job. " + statusLine.toString());
                } else {
                    jobId = mapper.readValue(response.getEntity().getContent(), JobId.class).jobId;
                }
            }
        }
        PostStateMachine psm = new PostStateMachine();
        do {
            psm.go();
        } while (psm.jobId == null);
        return psm.jobId;
    }

    /**
     * Requests the job status for the given jobId associated with given datasetId
     * @param datasetId the 4x4 of the dataset which was (to be) replaced
     * @param jobId the jobId returned from a successful commit post
     * @return either success or a publish error
     */
    private JobStatus getJobStatus(String datasetId, String jobId) throws
            URISyntaxException, IOException, InterruptedException, HttpException {
        JobStatus jobStatus = null;
        String status = null;
        StatusLine statusLine = null;
        URI statusUri = baseUri.setPath(datasyncPath + "/" + datasetId + statusPath + "/" + jobId).build();
        URI logUri = baseUri.setPath(datasyncPath + "/" + datasetId + logPath + "/" + jobId + ".json").build();
        int retries = 0;
        while (jobStatus == null && retries < httpRetries) {
            try(CloseableHttpResponse response = http.get(statusUri, ContentType.APPLICATION_JSON.getMimeType())) {
                statusLine = response.getStatusLine();
                int statusCode = statusLine.getStatusCode();
                if (statusCode == HttpStatus.SC_OK) {
                    status = IOUtils.toString(response.getEntity().getContent());
                    System.out.print("Polling the job status: " + status);
                    if (status.startsWith("SUCCESS")) {
                        jobStatus = JobStatus.SUCCESS;
                    } else if (status.startsWith("FAILURE")) {
                        jobStatus = JobStatus.PUBLISH_ERROR;
                    } else {
                        Thread.sleep(1000);
                    }
                } else if (statusCode != HttpStatus.SC_NOT_MODIFIED) {
                    retries += 1;
                    Thread.sleep(1000);
                }
            }
        }
        if (jobStatus == null) {
            throw new HttpException(statusLine.toString());
        }
        jobStatus.setMessage(status + " (jobId: " + jobId + ")");
        if(!jobStatus.isError()) loadStatusWithCRUD(jobStatus, logUri);
        return jobStatus;
    }

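    /**
     * Scans the delta-importer-2 log index for a committing-job entry carrying this run's
     * opaque UUID, to determine whether an apparently failed commit post actually went through.
     * @param datasetId the 4x4 of the dataset being patched
     * @param uuid the opaque UUID attached to the control file for this run
     * @return the commitment state plus, if committed, the associated jobId
     */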
    private Committal getJobCommitment(String datasetId, String uuid) throws URISyntaxException {
        URI logUri = baseUri.setPath(datasyncPath + "/" + datasetId + logPath + "/index.json").build();
        try(CloseableHttpResponse response = http.get(logUri, ContentType.APPLICATION_JSON.getMimeType())) {
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode == HttpStatus.SC_OK) {
                LogItem[] log = mapper.readValue(response.getEntity().getContent(), LogItem[].class);
                for (LogItem item : log) {
                    if (item.type.equalsIgnoreCase(committingLogKey) && uuid.equals(item.getOpaqueUUID())) {
                        String jobId = item.getJobId();
                        Commitment c = isJobCommitted(datasetId, jobId);
                        return new Committal(c, jobId);
                    }
                }
                return new Committal(Commitment.NOT_COMMITTING, null);
            } else {
                System.err.println("Unable to determine if job was committed from logs");
            }
        } catch (IOException e) {
            System.err.println("Unable to determine if job was committed from logs. " + e.getMessage());
        }
        return new Committal(Commitment.UNKNOWN, null);
    }

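    /**
     * Checks the job's own log for a committed-job entry, distinguishing a completed
     * commit from one still in flight.
     * @param datasetId the 4x4 of the dataset being patched
     * @param jobId the jobId found in the committing-job log entry
     * @return COMMITTED, COMMITTING, or UNKNOWN if the log could not be read
     */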
    private Commitment isJobCommitted(String datasetId, String jobId) throws URISyntaxException {
        URI logUri = baseUri.setPath(datasyncPath + "/" + datasetId + logPath + "/" + jobId + ".json").build();
        try(CloseableHttpResponse response = http.get(logUri, ContentType.APPLICATION_JSON.getMimeType())) {
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode == HttpStatus.SC_OK) {
                LogItem[] log = mapper.readValue(response.getEntity().getContent(), LogItem[].class);
                LogItem committed = getLogItem(log, committedLogKey);
                if (committed != null) {
                    return Commitment.COMMITTED;
                } else {
                    return Commitment.COMMITTING;
                }
            } else {
                System.err.println("Unable to determine if job was committed from logs");
            }
        } catch (IOException e) {
            System.err.println("Unable to determine if job was committed from logs. " + e.getMessage());
        }
        return Commitment.UNKNOWN;
    }


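    /**
     * Reads the 'finished' entry of the job's log and copies the row-level
     * create/update/delete counts and any errors onto the job status.
     * @param status the successful job status to annotate
     * @param logUri the URI of the job's log
     */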
    private void loadStatusWithCRUD(JobStatus status, URI logUri) {
        try(CloseableHttpResponse response = http.get(logUri, ContentType.APPLICATION_JSON.getMimeType())) {
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode == HttpStatus.SC_OK) {
                LogItem[] deltaLog = mapper.readValue(response.getEntity().getContent(), LogItem[].class);
                LogItem deltas = getLogItem(deltaLog, finishedLogKey);
                if (deltas != null) {
                    status.rowsCreated = deltas.getInserted();
                    status.rowsUpdated = deltas.getUpdated();
                    status.rowsDeleted = deltas.getDeleted();
                    status.errors = deltas.getErrors();
                }
            } else {
                System.err.println("Unable to parse out CRUD details from logs");
            }
        } catch (IOException e) {
            System.err.println("Unable to parse out CRUD details from logs");
        }
    }

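    /**
     * Builds the signature of an empty input, used when no previous signature exists
     * (the resulting patch is then effectively a full-file diff).
     * @return an input stream to the null signature
     */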
    private InputStream getNullSignature() throws IOException, NoSuchAlgorithmException {
        BufferedInputStream nullStream = new BufferedInputStream(new ByteArrayInputStream("".getBytes()));
        return new SignatureComputer.SignatureFileInputStream("MD5", "MD5", 10240, nullStream);
    }

    private LogItem getLogItem(LogItem[] logItems, String key) {
        for (LogItem item : logItems) {
            if (item.type.equalsIgnoreCase(key))
                return item;
        }
        return null;
    }

    private enum Commitment {
        NOT_COMMITTING, COMMITTING, COMMITTED, UNKNOWN;
    }

    private static class Committal {
        public Commitment status;
        public String jobId;

        public Committal(Commitment c, String id) {
            this.status = c;
            this.jobId = id;
        }
    }
}
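
// Usage sketch (not part of the original source): a minimal example of driving the
// publisher. `prefs` stands in for some concrete UserPreferences implementation and
// `control` for an already-built ControlFile; the dataset 4x4 and file path below are
// placeholders.
class Di2PublishExample {
    static JobStatus replaceDataset(UserPreferences prefs, ControlFile control) throws IOException {
        File csv = new File("/path/to/replacement.csv"); // placeholder path
        // try-with-resources closes the publisher's underlying HTTP resources
        try (DeltaImporter2Publisher publisher = new DeltaImporter2Publisher(prefs)) {
            return publisher.publishWithDi2OverHttp("abcd-1234", csv, control);
        }
    }
}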