Package net.yacy.kelondro.data.meta

Examples of net.yacy.kelondro.data.meta.DigestURI.toNormalform()


            long rn = -1;
            while (!ranked.isEmpty() && (entry = ranked.takeURL(false, 1000)) != null) {
                if ((entry == null) || (entry.metadata() == null)) continue;
                url = entry.metadata().url();
                if (url == null) continue;
                us = url.toNormalform(false, false);
                if (rn == -1) rn = entry.ranking();
                prop.put("genUrlList_urlList_"+i+"_urlExists", "1");
                prop.put("genUrlList_urlList_"+i+"_urlExists_urlhxCount", i);
                prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlhxValue", entry.word().urlhash());
                prop.putHTML("genUrlList_urlList_"+i+"_urlExists_keyString", keystring);
View Full Code Here


        // we use the SolrCell design as the index scheme
        final SolrInputDocument solrdoc = new SolrInputDocument();
        final DigestURI digestURI = new DigestURI(yacydoc.dc_source());
        addSolr(solrdoc, "failreason_t", ""); // overwrite a possible fail reason (in case that there was a fail reason before)
        addSolr(solrdoc, "id", id);
        addSolr(solrdoc, "sku", digestURI.toNormalform(true, false), 3.0f);
        final InetAddress address = Domains.dnsResolve(digestURI.getHost());
        if (address != null) addSolr(solrdoc, "ip_s", address.getHostAddress());
        if (digestURI.getHost() != null) addSolr(solrdoc, "host_s", digestURI.getHost());
        addSolr(solrdoc, "title", yacydoc.dc_title());
        addSolr(solrdoc, "author", yacydoc.dc_creator());
View Full Code Here

        final List<MediaSnippet> result = new ArrayList<MediaSnippet>();
        while (i.hasNext()) {
            entry = i.next();
            url = new DigestURI(entry.getKey());
            desc = entry.getValue();
            final int ranking = removeAppearanceHashes(url.toNormalform(false, false), queryhashes).size() +
                           removeAppearanceHashes(desc, queryhashes).size();
            if (ranking < 2 * queryhashes.size()) {
                result.add(new MediaSnippet(mediatype, url, MimeTable.url2mime(url), desc, document.getTextLength(), null, ranking, source));
            }
        }
View Full Code Here

            if (u.indexOf(".ico") >= 0 || u.indexOf("favicon") >= 0) continue;
            if (ientry.height() > 0 && ientry.height() < 32) continue;
            if (ientry.width() > 0 && ientry.width() < 32) continue;
            desc = ientry.alt();
            final int appcount = queryhashes.size()  * 2 -
                           removeAppearanceHashes(url.toNormalform(false, false), queryhashes).size() -
                           removeAppearanceHashes(desc, queryhashes).size();
            final long ranking = Long.MAX_VALUE - (ientry.height() + 1) * (ientry.width() + 1) * (appcount + 1);
            result.add(new MediaSnippet(ContentDomain.IMAGE, url, MimeTable.url2mime(url), desc, ientry.fileSize(), ientry.width(), ientry.height(), ranking, source));
        }
        return result;
View Full Code Here

            final DigestURI url = new DigestURI(e.getKey());
            final byte[] urlhash = url.hash();
            if (replace) {
                indexSegment.urlMetadata().remove(urlhash);
                this.nextQueue.urlRemove(urlhash);
                String u = url.toNormalform(true, true);
                if (u.endsWith("/")) {
                    u = u + "index.html";
                } else if (!u.contains(".")) {
                    u = u + "/index.html";
                }
View Full Code Here

                // in case that we want to return the cached content in the next step
                final RequestHeader requestHeader = new RequestHeader();
                requestHeader.put(HeaderFramework.USER_AGENT, ClientIdentification.getUserAgent());
                DigestURI refererURL = null;
                if (request.referrerhash() != null) refererURL = this.sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
                if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true, true));
                final Response response = new Response(
                        request,
                        requestHeader,
                        cachedResponse,
                        "200",
View Full Code Here

            while ((line = reader.readLine()) != null) {
                line = line.trim();
                if (line.length() > 0) {
                    try {
                        final DigestURI url = new DigestURI(line);
                        urls.add(url.toNormalform(true, true));
                    } catch (final MalformedURLException e) {
                        continue;
                    }
                }
                count++;
View Full Code Here

        // create a request header
        final RequestHeader requestHeader = new RequestHeader();
        requestHeader.put(HeaderFramework.USER_AGENT, ClientIdentification.getUserAgent());
        DigestURI refererURL = null;
        if (request.referrerhash() != null) refererURL = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
        if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true, true));
        requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE, sb.getConfig("crawler.http.acceptLanguage", DEFAULT_LANGUAGE));
        requestHeader.put(HeaderFramework.ACCEPT_CHARSET, sb.getConfig("crawler.http.acceptCharset", DEFAULT_CHARSET));
        requestHeader.put(HeaderFramework.ACCEPT_ENCODING, sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING));

        // HTTP-Client
View Full Code Here

        if (!url.getProtocol().equals("file")) throw new IOException("wrong loader for FileLoader: " + url.getProtocol());

        RequestHeader requestHeader = new RequestHeader();
        if (request.referrerhash() != null) {
            DigestURI ur = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
            if (ur != null) requestHeader.put(RequestHeader.REFERER, ur.toNormalform(true, false));
        }
       
        // process directories: transform them to html with meta robots=noindex (using the ftpc lib)
        String[] l = null;
        try {l = url.list();} catch (IOException e) {}
View Full Code Here

        if (!url.getProtocol().equals("smb")) throw new IOException("wrong loader for SMBLoader: " + url.getProtocol());

        RequestHeader requestHeader = new RequestHeader();
        if (request.referrerhash() != null) {
            DigestURI ur = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
            if (ur != null) requestHeader.put(RequestHeader.REFERER, ur.toNormalform(true, false));
        }
       
        // process directories: transform them to html with meta robots=noindex (using the ftpc lib)
        String[] l = null;
        try {l = url.list();} catch (IOException e) {}
View Full Code Here

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.