/*
 * Scone - The Web Enhancement Framework
 * Copyright (C) 2009 Harald Weinreich, Volkert Buchmann, Frank Wollenweber, Torsten Ha
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
 package scone.robot;


import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Date;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import java.util.TimeZone;
import java.util.Enumeration;
import java.util.Vector;

import scone.netobjects.HtmlNode;
import scone.netobjects.HtmlNodeCache;
import scone.netobjects.Inclusion;
import scone.netobjects.InclusionCache;
import scone.netobjects.Link;
import scone.netobjects.NetNode;
import scone.netobjects.NetNodeCache;
import scone.netobjects.Server;
import scone.netobjects.ServerCache;
import scone.netobjects.SimpleUri;
import scone.util.DocumentParser;
import scone.util.tokenstream.DummyTokenOutputStream;
import scone.util.tokenstream.TokenInputStreamTokenizerImpl;
import HTTPClient.HTTPResponse;
import HTTPClient.ModuleException;
import HTTPClient.NVPair;


/**
 * Objects of this class do all the work in the crawling process. They try to get an HtmlNode from the database or load a document from the web.<br>
 * Then the classifiers and filters are started. In the last step the links included in the page are put in the urlQueue.
 *
 * @author Frank Wollenweber
 */


class PageLoaderThread implements Runnable {

    // The queueEntry this pageLoaderThread will process
    private QueueEntry qe;
    // The robot this page loader thread is working for
    private Robot robot;
    // True, if the processing of this qe was finished.
    // Written by run() and read by interrupt(), which is triggered through the robot's timer,
    // hence volatile so the update is visible across threads.
    private volatile boolean finished;
    // True, if the processing was interrupted. Written by interrupt(), read by scan().
    private volatile boolean interrupted;
    // True, if the maximum download size was reached (only touched by the loading thread)
    private boolean sizeLimitReached;
    // This inputstream is needed to read the document from the server response.
    // Assigned in loadPage() and closed from interrupt(), hence volatile.
    private volatile InputStream inputstream = null;

    /**
     * Creates a page loader that is responsible for exactly one entry of the urlQueue.
     * All status flags start out as <code>false</code>.
     * @param robot The robot this page loader thread is working for
     * @param qe The entry of the urlQueue this pageLoaderThread will process
     */
    PageLoaderThread(Robot robot, QueueEntry qe) {
        // Nothing has been processed yet, so every status flag is cleared
        this.sizeLimitReached = false;
        this.interrupted = false;
        this.finished = false;
        // Remember the work item and its owner
        this.qe = qe;
        this.robot = robot;
    }


    /**
     * Formats a timestamp as an HTTP-date, e.g. <code>Sun, 06 Nov 1994 08:49:37 GMT</code>.
     * The result is used as the value of the If-Modified-Since request header. HTTP requires
     * such dates to be expressed in GMT, so the local time zone of the JVM is deliberately
     * not used.
     * @param date Timestamp of a date in milliseconds since the epoch
     * @return The date formatted as HTTP-date in GMT
     **/
    private String makeDateString(long date) {
        // SimpleDateFormat is not thread-safe, therefore a fresh instance is created per call
        SimpleDateFormat formatter = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss 'GMT'", Locale.US);

        formatter.setTimeZone(TimeZone.getTimeZone("GMT"));
        return formatter.format(new Date(date));
    }

    /**
     * Loads a document (or only its header, if the robotTask is head-only) from the web.
     * The response meta data (status, mime type, size, last-modified) is stored in the netNode,
     * the server status and response delay are stored in the belonging Server object.
     * If an HTML document was received, it is handed to the DocumentParser.
     * The http connection is always given back to the connection pool, even if the processing
     * fails unexpectedly.
     * @param netNode The uri of this node specifies which document should be loaded from the web
     * @param sendIfModifiedSince true, if the if-modified-since field of the request should be used
     * @param lastAccess The last access (user or robot) to the netNode; used as if-modified-since date
     */
    private void loadPage(NetNode netNode, boolean sendIfModifiedSince, long lastAccess) {
        RobotHttpConnection con = null;
        LimitedInputStream limitedInputStream = null;
        BufferedReader bufferedreader = null;
        boolean error = false;

        try {
            try {
                String file = netNode.getSUri().getFile();
                String path = netNode.getSUri().getPath();
                String query = netNode.getSUri().getQuery();

                if (query != null && query.length() > 0) {
                    file += "?" + query;
                }
                long starttime = System.currentTimeMillis();

                con = robot.getHttpConnectionPool().getHttpConnection(qe.getURI().toURL()); // Build up connection to the host
                con.setTimeout(robot.getTimeout()); // Set timeout
                con.setAllowUserInteraction(false);

                // Only pass the headers that are really sent, so the array never contains null
                NVPair userAgent = new NVPair("User-Agent", robot.getRobotName());
                NVPair[] headers;

                if (sendIfModifiedSince) {
                    headers = new NVPair[] {
                        userAgent,
                        new NVPair("If-Modified-Since", makeDateString(lastAccess))
                    };
                } else {
                    headers = new NVPair[] {userAgent};
                }
                con.setDefaultHeaders(headers);

                HTTPResponse rsp;

                if (qe.getRobotTask().getHeadOnly()) {
                    rsp = con.Head(path + file); // load header from server
                } else {
                    rsp = con.Get(path + file); // load page from server
                }
                long responsetime = System.currentTimeMillis() - starttime;
                Server s = ServerCache.get(netNode.getHost());

                // The delay is only stored for connections that were not re-used
                if (!con.reUsed()) {
                    s.setDelay(String.valueOf(responsetime));
                }

                int statusCode = rsp.getStatusCode();

                netNode.setAccessStatus(Integer.toString(statusCode));
                netNode.setLastRobotAccess(starttime);

                String contentType = rsp.getHeader("Content-Type");

                if (contentType != null) {
                    netNode.setMimeType(contentType);
                }
                String contentLength = rsp.getHeader("Content-Length");

                if (contentLength != null) {
                    netNode.setSize(contentLength);
                }
                String lastModified = rsp.getHeader("Last-Modified");

                if (lastModified != null) {
                    netNode.setLastModifiedString(lastModified);
                }
                if (statusCode >= 500) { // Save server error codes to server object.
                    s.setAccessStatus(String.valueOf(statusCode));
                } else {
                    s.setAccessStatus("200"); // Server is OK.
                }

                if (statusCode >= 300) { // evaluate error codes
                    error = true;
                    System.err.println("Robot - PageLoaderThread -> Received Error: " + rsp.getReasonLine() + " " + netNode.getUri());
                } else if (contentType == null || contentType.indexOf("text/html") == -1) {
                    // A response without Content-Type header is treated like a wrong mime-type
                    error = true;
                    System.err.println("Robot - PageLoaderThread -> Received Error: Wrong mime-type" + " " + netNode.getUri());
                } else if (!(qe.getRobotTask().getHeadOnly())) {
                    inputstream = rsp.getInputStream();
                    if (qe.getRobotTask().getMaxPageSize() > 0) {
                        // Wrap the stream so the configured maximum page size is enforced
                        limitedInputStream = new LimitedInputStream(inputstream, qe.getRobotTask().getMaxPageSize());
                        bufferedreader = new BufferedReader(new InputStreamReader(limitedInputStream));
                    } else {
                        bufferedreader = new BufferedReader(new InputStreamReader(inputstream));
                    }
                }
            } catch (IOException ioe) {
                error = true; // Never parse a document whose download failed
                System.out.println("Robot - PageLoaderThread -> Problems loading the uri " + qe.getURI());
                System.err.println(ioe.toString());
                netNode.setAccessStatus("408"); // By HW: Timeout Error...
            } catch (ModuleException me) {
                error = true; // Never parse a document whose download failed
                System.err.println("Robot - PageLoaderThread -> Error handling request: " + me.getMessage());
            }

            if ((inputstream != null) && (bufferedreader != null) && (!error)) { // Check, if the inputstream was initialized correctly
                parseDocument(netNode, bufferedreader);
                sizeLimitReached = ((limitedInputStream != null)
                        && (limitedInputStream.getLimitReached()));
            }
        } finally {
            // Return the httpConnection to the pool, also if an unexpected exception occurred.
            // NOTE(review): if no connection could be obtained there is nothing to return - confirm
            // that the pool does not expect a call in this case.
            if (con != null) {
                robot.getHttpConnectionPool().returnHttpConnection(con);
            }
        }
        qe.getRobotTask().incDownloadedUris();
    }

    /**
     * Runs the DocumentParser on the downloaded document. The parser receives the netNode as
     * "baseNode" in the meta info of the token stream; the parsed tokens are discarded.
     * @param netNode The node the document belongs to
     * @param reader The reader delivering the document
     */
    private void parseDocument(NetNode netNode, BufferedReader reader) {
        int parserRequirements = scone.Plugin.PARSEDOCUMENT
                | scone.Plugin.CONSIDERLINKS
                | scone.Plugin.CONSIDERINCLUSIONS
                | scone.Plugin.CONSIDERKEYWORDS
                | scone.Plugin.CALCFINGERPRINT | scone.Plugin.SAVEBODYTEXT;

        if (qe.getRobotTask().getRequireSourceCode()) {
            parserRequirements = parserRequirements
                    | scone.Plugin.SAVESOURCECODE;
        }
        DocumentParser documentParser = new DocumentParser(parserRequirements, false);
        TokenInputStreamTokenizerImpl tokenInputStream = new TokenInputStreamTokenizerImpl(reader);

        tokenInputStream.getMetaInfo().put("baseNode", netNode);
        DummyTokenOutputStream tokenOutputStream = new DummyTokenOutputStream();

        documentParser.parse(tokenInputStream, tokenOutputStream);
        try {
            tokenInputStream.close();
            tokenOutputStream.close();
        } catch (IOException ioe) {
            System.out.println(ioe.toString());
        }
    }

    /**
     * Hands the node to every pageClassifier registered at the robotTask of the queueEntry.
     * Nothing happens if the robotTask delivers no classifiers.
     * @param robotHtmlNode The node to classify
     */
    private void runPageClassifier(RobotHtmlNode robotHtmlNode) {
        Enumeration pageClassifiers = qe.getRobotTask().getPageClassifier();

        if (pageClassifiers == null) {
            return;
        }
        while (pageClassifiers.hasMoreElements()) {
            ((PageClassifier) pageClassifiers.nextElement()).classify(robotHtmlNode, qe);
        }
    }

    /**
     * Asks every pageFilter registered at the robotTask of the queueEntry about the node.
     * All filters are called, even if an earlier filter already rejected the node.
     * @param robotHtmlNode The node to filter
     * @return True, if the crawling should be continued, i.e. no filter rejected the node
     */
    private boolean runPageFilter(RobotHtmlNode robotHtmlNode) {
        Enumeration pageFilters = qe.getRobotTask().getPageFilter();
        boolean accepted = true;

        if (pageFilters == null) {
            return accepted;
        }
        while (pageFilters.hasMoreElements()) {
            PageFilter pageFilter = (PageFilter) pageFilters.nextElement();
            // Evaluate the filter first, so it is always called
            boolean verdict = pageFilter.filter(robotHtmlNode, qe);

            if (!verdict) {
                accepted = false;
            }
        }
        return accepted;
    }

    /**
     * Hands the link to every linkClassifier registered at the robotTask of the queueEntry.
     * Nothing happens if the robotTask delivers no classifiers.
     * @param robotLink The link to classify
     * @param robotHtmlNode The node that is passed to the classifiers together with the link
     */
    private void runLinkClassifier(RobotLink robotLink, RobotHtmlNode robotHtmlNode) {
        Enumeration linkClassifiers = qe.getRobotTask().getLinkClassifier();

        if (linkClassifiers == null) {
            return;
        }
        while (linkClassifiers.hasMoreElements()) {
            ((LinkClassifier) linkClassifiers.nextElement()).classify(robotLink, robotHtmlNode, qe);
        }
    }

    /**
     * Asks every linkFilter registered at the robotTask of the queueEntry about the link.
     * All filters are called, even if an earlier filter already rejected the link.
     * @param robotLink The link to filter
     * @param robotHtmlNode The node that is passed to the filters together with the link
     * @return True, if the crawling should be continued with this link, i.e. no filter rejected it
     */
    private boolean runLinkFilter(RobotLink robotLink, RobotHtmlNode robotHtmlNode) {
        Enumeration linkFilters = qe.getRobotTask().getLinkFilter();
        boolean accepted = true;

        if (linkFilters == null) {
            return accepted;
        }
        while (linkFilters.hasMoreElements()) {
            LinkFilter linkFilter = (LinkFilter) linkFilters.nextElement();
            // Evaluate the filter first, so it is always called
            boolean verdict = linkFilter.filter(robotLink, robotHtmlNode, qe);

            if (!verdict) {
                accepted = false;
            }
        }
        return accepted;
    }

    /**
     * Starts the link classifiers and then decides whether to put a new entry in the queue or not.
     * A link that is rejected by the default filter or by a link filter is counted as filtered.
     * @param link Handle this link
     * @param robotLink The belonging robotLink
     * @param robotHtmlNode The link was found in this document
     **/
    private void handleLink(SimpleUri link, RobotLink robotLink, RobotHtmlNode robotHtmlNode) {
        runLinkClassifier(robotLink, robotHtmlNode);
        if ((qe.getRobotTask().getDefaultFilter().filter(link))
                && (runLinkFilter(robotLink, robotHtmlNode))) {
            queueOrRaiseDepth(link);
        } else {
            // Link filtered
            qe.getRobotTask().incFilteredUris();
        }
    }

    /**
     * Decides whether to put a new entry in the queue or not.
     * Only the default filter is applied to inclusions; a rejected inclusion is counted as filtered.
     * @param link Handle this inclusion
     * @param inclusion The inclusion object (currently not evaluated)
     **/
    private void handleInclusion(SimpleUri link, Inclusion inclusion) {
        if (qe.getRobotTask().getDefaultFilter().filter(link)) {
            queueOrRaiseDepth(link);
        } else {
            // Link filtered
            qe.getRobotTask().incFilteredUris();
        }
    }

    /**
     * Shared queueing logic of links and inclusions. The uri would be scanned with the depth of
     * the current queueEntry minus one. If the uri is already pending or open, only the depth
     * of the existing entry is raised when necessary. Otherwise the uri is queued, unless it was
     * already scanned with at least the same depth.
     * @param link The uri that passed the filters
     */
    private void queueOrRaiseDepth(SimpleUri link) {
        QueueEntry candidate = new QueueEntry(link, qe.getDepth() - 1, qe.getRobotTask());

        QueueEntry pending = robot.getPendingURL(link, qe.getRobotTask());

        if (pending != null) {
            raiseDepth(pending, candidate);
            return;
        }

        QueueEntry open = qe.getRobotTask().getOpenUri(link);

        if (open != null) {
            raiseDepth(open, candidate);
            return;
        }

        RobotHtmlNode downloaded = qe.getRobotTask().getResultNode(link);

        if ((downloaded == null)
                || (downloaded.getScannedDepth() < candidate.getDepth())) {
            robot.queue(candidate);
            qe.getRobotTask().incQueuedUris();
        }
    }

    /**
     * Raises the depth of an existing queueEntry to the depth of the candidate, if that is larger.
     * @param existing The entry that already exists for the uri
     * @param candidate The entry that would have been created for the uri
     */
    private void raiseDepth(QueueEntry existing, QueueEntry candidate) {
        if (existing.getDepth() < candidate.getDepth()) {
            existing.setDepth(candidate.getDepth());
        }
    }

    /**
     * This method does the processing of one queueEntry: it decides whether the document has to
     * be downloaded (again), runs the page classifiers and filters and puts the links and the
     * frame inclusions of the document in the urlQueue.
     * Entries with a negative depth are ignored completely.
     */
    private void scan() {
        if (qe.getDepth() < 0) { // Only if the depth is >= 0 the processing will continue
            return;
        }
        NetNode netNode = NetNodeCache.get(qe.getURI()); // Look up, if there is an object in the cache
        HtmlNode htmlNode = null;
        RobotHtmlNode robotHtmlNode = null;

        if (!qe.getRobotTask().getHeadOnly()) {
            htmlNode = HtmlNodeCache.check(netNode);
        }

        // Build up the downloadCondition
        long lastAccess = Math.max(netNode.getLastAccess(), netNode.getLastRobotAccess());

        // The node is known, but was last accessed before the requested update date
        boolean updateDownloadCondition = (qe.getRobotTask().getUpdateDate() > 0)
                && (lastAccess > 0)
                && (lastAccess < qe.getRobotTask().getUpdateDate());
        boolean generalDownloadCondition;

        if (qe.getRobotTask().getHeadOnly()) {
            generalDownloadCondition = (!(qe.getRobotTask().getCheckDatabase()))
                    || (netNode.getAccessStatus().equals("-1"));
        } else {
            generalDownloadCondition = (!(qe.getRobotTask().getCheckDatabase()))
                    || (htmlNode == null)
                    || (htmlNode.getNumberOfLinks().equals("-1"))
                    || (!(htmlNode.isCompletelyDownloaded()))
                    || ((qe.getRobotTask().getRequireSourceCode())
                            && (htmlNode.getSourceCode().equals("")))
                    || ((qe.getRobotTask().getDoContenSeenTest())
                            && (htmlNode.getFingerprint().equals("")));
        }
        boolean downloadCondition = (generalDownloadCondition
                || updateDownloadCondition);

        if (downloadCondition) { // Need to download the page (again) from the web
            // If-Modified-Since is only sent, if the update date is the sole reason for the download
            boolean sendIfModifiedSince = (!generalDownloadCondition) && updateDownloadCondition;

            // The uri is taken from the netNode, because the htmlNode is null for head-only
            // tasks and for documents that are not in the cache yet
            if ((!qe.getRobotTask().getObeyRobotExclusion())
                    || (robot.noRobotsTest(netNode.getSUri()))) {
                loadPage(netNode, sendIfModifiedSince, lastAccess);
            }
            if ((qe.getRobotTask().getMaxDownloadUris() > 0)
                    && (qe.getRobotTask().getDownloadedUris()
                            >= qe.getRobotTask().getMaxDownloadUris())) {
                robot.stopRobotTask(qe.getRobotTask());
            }
        } else {
            // Node found in the cache was sufficient
            qe.getRobotTask().incCacheHits();
        }

        htmlNode = HtmlNodeCache.check(netNode);
        if (htmlNode == null) {
            robotHtmlNode = new RobotHtmlNode(netNode, qe.getDepth());
        } else {
            robotHtmlNode = new RobotHtmlNode(htmlNode, qe.getDepth());
            if (downloadCondition) {
                // Only if the download was neither interrupted nor cut off, the attributes
                // of the htmlNode like the number of words are correct
                htmlNode.setCompletelyDownloaded(!(interrupted || sizeLimitReached));
            }
        }
        runPageClassifier(robotHtmlNode);

        // Condition for continuing the scan
        if ((htmlNode != null) && (qe.getDepth() >= 1)
                && (runPageFilter(robotHtmlNode))
                && (!qe.getRobotTask().wasStopped())) { // Only if the depth is >=1 the links are put in the urlQueue
            // Handling the links
            Enumeration links = htmlNode.getOutgoingLinks().elements();

            while ((links.hasMoreElements())
                    && (!qe.getRobotTask().wasStopped())) {
                SimpleUri link = null;
                RobotLink robotLink = null;

                try {
                    Link l = (Link) links.nextElement();

                    robotLink = new RobotLink(l);
                    link = l.getToNode().getSUri();
                } catch (Exception e) {
                    System.out.println("Robot -> PageLoaderThread: Invalid link, e.g. JavaScript");
                    continue; // Skip this link
                }
                handleLink(link, robotLink, robotHtmlNode);
                qe.getRobotTask().incCheckedUris();
            }

            // Handling the frames
            if (htmlNode.isFrames()) {
                InclusionCache.clean(); // Store to DB...
                Vector v = InclusionCache.getIncludedObjects(htmlNode.getNode());

                for (int i = 0; i < v.size(); i++) {
                    // Stop queueing as soon as the robotTask was stopped (same rule as for the links)
                    if (qe.getRobotTask().wasStopped()) {
                        break;
                    }
                    Inclusion inclusion = null;
                    SimpleUri link = null;

                    try {
                        inclusion = (Inclusion) v.elementAt(i);
                        link = inclusion.getChildNode().getSUri();
                    } catch (Exception e) {
                        System.out.println("Robot - PageLoaderThread -> Invalid link, e.g. JavaScript");
                        continue; // Skip this inclusion
                    }
                    handleInclusion(link, inclusion);
                    qe.getRobotTask().incCheckedUris();
                }
            }
        }
        qe.getRobotTask().moveOpenUriToResult(qe, robotHtmlNode);
        robot.pageFinished(qe, robotHtmlNode);
    }

    /**
     * This method is called by the PageLoaderInterruptTimer.
     * If the processing is not finished yet, the loader is marked as interrupted and the
     * inputstream of the running download (if there is one) is closed.
     **/
    void interrupt() {
        if (finished) {
            return;
        }
        System.out.println("Robot - PageLoaderThread -> interrupted " + qe.getURI().toDocString());
        interrupted = true;

        // Read the field only once, the download may not have opened a stream yet
        InputStream in = inputstream;

        if (in == null) {
            return;
        }
        try {
            in.close();
        } catch (Exception ex) {
            // Closing is best effort: a failure must not propagate to the caller,
            // but it should not vanish silently either
            System.err.println("Robot - PageLoaderThread -> Could not close the inputstream: " + ex.toString());
        }
    }

    /**
     * Runs this thread: schedules the interrupt timer, if the robotTask has a maximum download
     * time (a value of -1 means no limit), and then processes the queueEntry.
     * The loader is marked as finished even if the processing ends with an exception, so a
     * later timer event does not try to interrupt a loader that is no longer running.
     */
    public void run() {
        PageLoaderInterruptTimer pit = new PageLoaderInterruptTimer(this);

        if (qe.getRobotTask().getMaxDownloadTime() != -1) {
            robot.getTimer().schedule(pit, qe.getRobotTask().getMaxDownloadTime());
        }
        try {
            scan();
        } finally {
            finished = true;
        }
    }
}
