Package net.yacy.cora.protocol

Examples of net.yacy.cora.protocol.ResponseHeader


                list.add(u + ((u.endsWith("/") || u.endsWith("\\")) ? "" : "/") + s);
            }
        
            StringBuilder content = FTPClient.dirhtml(u, null, null, null, list, true);
           
            ResponseHeader responseHeader = new ResponseHeader();
            responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
            responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
            Response response = new Response(
                    request,
                    requestHeader,
                    responseHeader,
                    "200",
                    profile,
                    content.toString().getBytes());
           
            return response;
        }
       
        // create response header
        String mime = Classification.ext2mime(url.getFileExtension());
        ResponseHeader responseHeader = new ResponseHeader();
        responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified())));
        responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
       
        // check mime type and availability of parsers
        // and also check resource size and limitation of the size
        long size;
        try {
            size = url.length();
        } catch (Exception e) {
            size = -1;
        }
        String parserError = null;
        if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
            (size > maxFileSize && maxFileSize >= 0)) {
            // we know that we cannot process that file before loading
            // only the metadata is returned
           
            if (parserError != null) {
                log.logInfo("No parser available in File crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
            } else {
                log.logInfo("Too big file in File crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
            }
           
            // create response with metadata only
            responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
            Response response = new Response(
                    request,
                    requestHeader,
                    responseHeader,
View Full Code Here


                list.add(u + s);
            }
        
            StringBuilder content = FTPClient.dirhtml(u, null, null, null, list, true);
           
            ResponseHeader responseHeader = new ResponseHeader();
            responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
            responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
            Response response = new Response(
                    request,
                    requestHeader,
                    responseHeader,
                    "200",
                    profile,
                    content.toString().getBytes());
           
            return response;
        }
       
        // create response header
        String mime = Classification.ext2mime(url.getFileExtension());
        ResponseHeader responseHeader = new ResponseHeader();
        responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified())));
        responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
       
        // check mime type and availability of parsers
        // and also check resource size and limitation of the size
        long size;
        try {
            size = url.length();
        } catch (Exception e) {
            size = -1;
        }
        String parserError = null;
        if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
            (size > maxFileSize && maxFileSize >= 0)) {
            // we know that we cannot process that file before loading
            // only the metadata is returned
           
            if (parserError != null) {
                log.logInfo("No parser available in SMB crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
            } else {
                log.logInfo("Too big file in SMB crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
            }
           
            // create response with metadata only
            responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
            Response response = new Response(
                    request,
                    requestHeader,
                    responseHeader,
View Full Code Here

                final StringBuilder dirList = ftpClient.dirhtml(path);

                if (dirList == null) {
                    response = null;
                } else {
                    final ResponseHeader responseHeader = new ResponseHeader();
                    responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
                    responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
                    final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
                    response = new Response(
                            request,
                            requestHeader,
                            responseHeader,
View Full Code Here

        final RequestHeader requestHeader = new RequestHeader();
        if (request.referrerhash() != null) {
            final DigestURI refurl = this.sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
            if (refurl != null) requestHeader.put(RequestHeader.REFERER, refurl.toNormalform(true, false));
        }
        final ResponseHeader responseHeader = new ResponseHeader();
        responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(fileDate));
        responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);

        // if the mimetype and file extension is supported we start to download the file
        final long size = ftpClient.fileSize(path);
        responseHeader.put(HeaderFramework.CONTENT_LENGTH, String.valueOf(size));
        String parserError = null;
        if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
            (size > this.maxFileSize && this.maxFileSize >= 0)) {
            // we know that we cannot process that file before loading
            // only the metadata is returned

            if (parserError != null) {
                this.log.logInfo("No parser available in FTP crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
            } else {
                this.log.logInfo("Too big file in FTP crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
            }

            // create response with metadata only
            responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
            final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
            final Response response = new Response(
                    request,
                    requestHeader,
                    responseHeader,
View Full Code Here

            // statistics:
            if (robotsTxt != null) {
              ByteCount.addAccountCount(ByteCount.CRAWLER, robotsTxt.length);
            }
            final int code = client.getHttpResponse().getStatusLine().getStatusCode();
            final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders());

            // check the response status
            if (code > 199 && code < 300) {
              if (!header.mime().startsWith("text/plain")) {
                    robotsTxt = null;
                    log.info("Robots.txt from URL '" + robotsURL + "' has wrong mimetype '" + header.mime() + "'.");
                } else {

                    // getting some metadata
                  eTag = header.containsKey(HeaderFramework.ETAG)?(header.get(HeaderFramework.ETAG)).trim():null;
                    lastMod = header.lastModified();

                    // if the robots.txt file was not changed we break here
                    if ((eTag != null) && (oldEtag != null) && (eTag.equals(oldEtag))) {
                        if (log.isDebugEnabled()) log.debug("Robots.txt from URL '" + robotsURL + "' was not modified. Abort downloading of new version.");
                        return null;
                    }


                    downloadEnd = System.currentTimeMillis();
                    if (log.isDebugEnabled()) log.debug("Robots.txt successfully loaded from URL '" + robotsURL + "' in " + (downloadEnd-downloadStart) + " ms.");
                }
            } else if (code == 304) {
                return null;
            } else if (code > 299 && code < 400) {
                // getting redirection URL
              String redirectionUrlString = header.get(HeaderFramework.LOCATION);
                if (redirectionUrlString==null) {
                    if (log.isDebugEnabled())
                    log.debug("robots.txt could not be downloaded from URL '" + robotsURL + "' because of missing redirecton header. [" + client.getHttpResponse().getStatusLine() + "].");
                    robotsTxt = null;
                } else {
View Full Code Here

        this.outgoingHeader = outgoingHeader;
    }
   
    public ResponseHeader getOutgoingHeader() {
        if(outgoingHeader == null)
            return new ResponseHeader();
        return outgoingHeader;
    }
View Full Code Here

    public Response(final Request request, final CrawlProfile profile) {
        this.request = request;
        // request and response headers may be zero in case that we process surrogates
        this.requestHeader = new RequestHeader();
        this.responseHeader = new ResponseHeader();
        if (request.size() > 0) this.responseHeader.put(HeaderFramework.CONTENT_LENGTH, Long.toString(request.size()));
        this.responseStatus = "200";
        this.profile = profile;
        this.status = QUEUE_STATE_FRESH;
        this.content = request.url().toTokens().getBytes();
View Full Code Here

        client.setRedirecting(false); // we want to handle redirection ourselves, so we don't index pages twice
        client.setTimout(this.socketTimeout);
        client.setHeader(requestHeader.entrySet());
            // send request
          final byte[] responseBody = client.GETbytes(url, maxFileSize);
          final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders());
          final int code = client.getHttpResponse().getStatusLine().getStatusCode();

          if (code > 299 && code < 310) {
            // redirection (content may be empty)
                if (header.containsKey(HeaderFramework.LOCATION)) {
                    // getting redirection URL
                  String redirectionUrlString = header.get(HeaderFramework.LOCATION);
                    redirectionUrlString = redirectionUrlString.trim();

                    if (redirectionUrlString.length() == 0) {
                        this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection header empy", code);
                        throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " aborted. Location header is empty.");
View Full Code Here

        final HTTPClient client = new HTTPClient();
        client.setTimout(20000);
        client.setHeader(requestHeader.entrySet());
          final byte[] responseBody = client.GETbytes(request.url());
          final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders());
          final int code = client.getHttpResponse().getStatusLine().getStatusCode();
            // FIXME: 30*-handling (bottom) is never reached
            // we always get the final content because httpClient.followRedirects = true

          if (responseBody != null && (code == 200 || code == 203)) {
                // the transfer is ok

            //statistics:
            ByteCount.addAccountCount(ByteCount.CRAWLER, responseBody.length);

                // we write the new cache entry to file system directly

                // create a new cache entry
                response = new Response(
                        request,
                        requestHeader,
                        header,
                        Integer.toString(code),
                        null,
                        responseBody
                );

                return response;
            } else if (code > 299 && code < 310) {
                if (header.containsKey(HeaderFramework.LOCATION)) {
                    // getting redirection URL
                  String redirectionUrlString = header.get(HeaderFramework.LOCATION);
                    redirectionUrlString = redirectionUrlString.trim();

                    if (redirectionUrlString.length() == 0) {
                        throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " aborted. Location header is empty.");
                    }
View Full Code Here

        int c = 0;
        if (result == null) return c;
        // iterate over all images in the result
        final List<MediaSnippet> imagemedia = result.mediaSnippets();
        if (imagemedia != null) {
          ResponseHeader header;
            feedloop: for (final MediaSnippet ms: imagemedia) {
                // check cache to see if the mime type of the image url is correct
                header = Cache.getResponseHeader(ms.href.hash());
                if (header != null) {
                    // this does not work for all urls since some of them may not be in the cache
                    if (header.mime().startsWith("text") || header.mime().startsWith("application")) continue feedloop;
                }
                this.images.put(new ReverseElement<MediaSnippet>(ms, ms.ranking)); // remove smallest in case of overflow
                c++;
                //System.out.println("*** image " + UTF8.String(ms.href.hash()) + " images.size = " + images.size() + "/" + images.size());
            }
View Full Code Here

TOP

Related Classes of net.yacy.cora.protocol.ResponseHeader

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.