/*
 * Scone - The Web Enhancement Framework
 * Copyright (C) 2009 Harald Weinreich, Volkert Buchmann, Frank Wollenweber, Torsten Ha
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
 package scone.robot;


import java.util.Date;
import java.util.Enumeration;
import java.util.Vector;

import scone.netobjects.SimpleUri;


/**
 * A RobotTask describes one crawling job for the robot.
 * <p>
 * Use the constructor to create a task with its basic properties (start URI, depth,
 * restriction and user) and the setters to adjust the optional parameters. While the
 * task is processed, the same object collects the statistics, the list of currently
 * open URIs and the result nodes of the crawl.
 * <p>
 * Synchronization as visible in this class: the statistic counters are guarded by a
 * private lock object, the open-URI and result lists are accessed through methods
 * synchronized on the task itself, and the open-thread counter relies on the caller
 * being synchronized to urlQueue.
 *
 * @author Frank Wollenweber
 */
public class RobotTask {

    // constants

    /** Follow only internal links */
    public final static int INTERNAL = 1;

    /** Follow only links that point to files in the same subdirectory */
    public final static int SUBDIRECTORIES = 2;

    /** Follow only external links */
    public final static int EXTERNAL = 3;

    /** Follow all links */
    public final static int ALL = 4;

    private final int id; // The unique id of this task
    private final RobotUser robotUser; // The user of the robot

    // These variables define properties of the task
    private final SimpleUri startURI; // Start with this URI
    private boolean headOnly = false; // Download only the head of the startURI
    private final int depth; // Scan this depth. 0 means only download the start URI.
    private final int restriction; // One of INTERNAL, SUBDIRECTORIES, EXTERNAL, ALL
    private boolean obeyRobotExclusion = false; // The robot should obey the robot exclusion standard
    private long expiry = -1; // Delay (milliseconds after arrival) until this task expires
    private int maxDownloadUris = -1; // At most this many URIs are downloaded in the scanning process
    private boolean checkDatabase = true; // The robot checks the database before downloading pages from the web
    private long updateDate = -1; // Objects in the database older than updateDate are loaded again from the web
    private int maxPageSize = -1; // At most this many bytes of a page are downloaded
    private long maxDownloadTime = -1; // Maximum milliseconds a PageLoaderThread tries to download a page
    private final Vector pageClassifierList = new Vector(); // The page classifiers
    private final Vector linkClassifierList = new Vector(); // The link classifiers
    private final Vector pageFilterList = new Vector(); // The page filters
    private final Vector linkFilterList = new Vector(); // The link filters
    private final DefaultFilter defaultFilter; // The default filter
    private boolean doContentSeenTest = false; // If true, a ContentSeenFilter has been added to the page filters
    private boolean requireSourceCode = true; // If true, the source of the documents is required

    // These values are set during the processing of the task
    private long arrivalTime = -1; // Arrival time of this task
    private long startTime = -1; // Start time of this task
    private long endTime = -1; // End time of this task
    private int checkedUris = 0; // How many URIs are checked ...
    private int queuedUris = 0; // ... and how many of them are put in the queue
    private int filteredUris = 0; // ... and how many of them are filtered
    private int downloadedUris = 0; // How many of the URIs put in the queue are downloaded ...
    private int cacheHits = 0; // ... and how many of them are found in the cache
    private final Vector resultNodes = new Vector(); // The result of the crawl
    private final Vector openUris = new Vector(); // These URIs are open now
    private int openThreads = 0; // Number of threads currently handling this task
    private boolean stopped = false; // True, if the task was stopped unfinished
    private boolean finished = false; // True, if the robot finished this task

    // lock-object guarding the statistic counters (reads and writes)
    private final Object statisticsLock = new Object();

    /**
     * Creates a new task. All optional parameters start with their defaults and can be
     * changed with the setters of this class.
     *
     * @param startURI start the crawl at this URI
     * @param depth follow the links with this depth
     * @param restriction use the constants defined in this class to restrict the crawling process
     * @param robotUser the robotUser will be called for every found document and at the end of the crawling
     */
    public RobotTask(SimpleUri startURI, int depth, int restriction, RobotUser robotUser) {
        this.id = Robot.getNextRobotTaskId();
        this.robotUser = robotUser;
        this.startURI = startURI;
        this.depth = depth;
        this.restriction = restriction;
        this.defaultFilter = new DefaultFilter(restriction, startURI);
    }

    /**
     * Get the task's unique id
     * @return id
     */
    public int getId() {
        return id;
    }

    /**
     * Get the user of this RobotTask
     * @return User of this task
     */
    RobotUser getRobotUser() {
        return robotUser;
    }

    // The following methods are used to access the parameters of the task

    /**
     * Get the start URI of this task
     * @return start URI
     */
    public SimpleUri getStartURI() {
        return startURI;
    }

    /**
     * If this flag is set, HEAD instead of GET is used to contact the server.
     * The value is also forwarded to the task's DefaultFilter.
     * @param headOnly if true, only the head of the start URI will be loaded
     */
    public void setHeadOnly(boolean headOnly) {
        this.headOnly = headOnly;
        defaultFilter.setHeadOnly(headOnly);
    }

    /**
     * Checks, if the robot is in headOnly mode
     * @return true, if only the head is loaded
     */
    public boolean getHeadOnly() {
        return headOnly;
    }

    /**
     * Get the crawling depth
     * @return crawling depth
     */
    public int getDepth() {
        return depth;
    }

    /**
     * Should the robot obey the robotExclusion. For details see http://www.robotstxt.org/wc/exclusion.html
     * @param obeyRobotExclusion if true, the robot will obey the robot exclusion protocol
     */
    public void setObeyRobotExclusion(boolean obeyRobotExclusion) {
        this.obeyRobotExclusion = obeyRobotExclusion;
    }

    /**
     * Checks, if the robot is in obeyRobotExclusion mode
     * @return true, if the robot obeys the robot exclusion protocol
     */
    public boolean getObeyRobotExclusion() {
        return obeyRobotExclusion;
    }

    /**
     * When should this task expire. After this time the robot will stop this robot task, even if its processing has not been started yet.
     * @param time time period in milliseconds beginning with the arrival of the task at the robot to the task's expiry.
     */
    public void setExpiry(long time) {
        expiry = time;
    }

    /**
     * Get the expiry time
     * @return expiry time (initially -1)
     */
    public long getExpiry() {
        return expiry;
    }

    /**
     * Max documents are downloaded from the web. After the robot has downloaded max documents from the web, the task is stopped.
     * Running PageLoaderThreads are not interrupted, so that the actual number of downloaded documents may be higher.
     * @param max download max documents
     */
    public void setMaxDownloadUris(int max) {
        maxDownloadUris = max;
    }

    /**
     * Get the maximum number of documents the robot will download
     * @return max downloaded URIs (initially -1)
     */
    public int getMaxDownloadUris() {
        return maxDownloadUris;
    }

    /**
     * Should the robot check the database before trying to download a document from the web.
     * @param checkDatabase if true the robot always tries to find linked documents in the database.
     */
    public void setCheckDatabase(boolean checkDatabase) {
        this.checkDatabase = checkDatabase;
    }

    /**
     * See, if the robot checks the database
     * @return true, if the robot checks the database
     */
    public boolean getCheckDatabase() {
        return checkDatabase;
    }

    /**
     * Pages that were accessed (by the robot or the user) before date are downloaded again.
     * @param date date in milliseconds after January 1, 1970 00:00:00 GMT
     */
    public void setUpdateDate(long date) {
        updateDate = date;
    }

    /**
     * Gets the update date
     * @return date in milliseconds after January 1, 1970 00:00:00 GMT (initially -1)
     */
    public long getUpdateDate() {
        return updateDate;
    }

    /**
     * Only the specified amount of bytes are downloaded from each document
     * @param size download only size bytes
     */
    public void setMaxPageSize(int size) {
        maxPageSize = size;
    }

    /**
     * Gets the download size limit
     * @return the maximum amount of bytes the robot will download for each page (initially -1)
     */
    public int getMaxPageSize() {
        return maxPageSize;
    }

    /**
     * The robot will only download a document for the specified time
     * @param time maximum download time in milliseconds for each document
     */
    public void setMaxDownloadTime(long time) {
        maxDownloadTime = time;
    }

    /**
     * Gets the maximum download time
     * @return maximum download time for each page (initially -1)
     */
    public long getMaxDownloadTime() {
        return maxDownloadTime;
    }

    /**
     * Adds a PageClassifier to this task. The classifier can add attributes to the page. All classifiers are executed serially.
     * @param pageClassifier add this PageClassifier
     */
    public void addPageClassifier(PageClassifier pageClassifier) {
        pageClassifierList.add(pageClassifier);
    }

    /**
     * Removes a PageClassifier
     * @param pageClassifier remove this one
     */
    public void removePageClassifier(PageClassifier pageClassifier) {
        pageClassifierList.remove(pageClassifier);
    }

    /**
     * Adds a LinkClassifier to this task. The classifier can add attributes to the link. All classifiers are executed serially.
     * @param linkClassifier add this LinkClassifier
     */
    public void addLinkClassifier(LinkClassifier linkClassifier) {
        linkClassifierList.add(linkClassifier);
    }

    /**
     * Removes a LinkClassifier
     * @param linkClassifier remove this one
     */
    public void removeLinkClassifier(LinkClassifier linkClassifier) {
        linkClassifierList.remove(linkClassifier);
    }

    /**
     * Adds a PageFilter to this task.
     * Page filters decide whether to stop the crawling at the current document or to continue with the links.
     * The filters are executed serially and a boolean and operation is used for the decision.
     * @param pageFilter add this PageFilter
     */
    public void addPageFilter(PageFilter pageFilter) {
        pageFilterList.add(pageFilter);
    }

    /**
     * Removes a PageFilter
     * @param pageFilter remove this one
     */
    public void removePageFilter(PageFilter pageFilter) {
        pageFilterList.remove(pageFilter);
    }

    /**
     * Adds a LinkFilter to this task.
     * Link filters decide whether to follow a link or not. The filters are executed serially
     * and a boolean and operation is used for the decision.
     * @param linkFilter add this LinkFilter
     */
    public void addLinkFilter(LinkFilter linkFilter) {
        linkFilterList.add(linkFilter);
    }

    /**
     * Removes a LinkFilter
     * @param linkFilter remove this one
     */
    public void removeLinkFilter(LinkFilter linkFilter) {
        linkFilterList.remove(linkFilter);
    }

    /**
     * Get an Enumeration of all PageClassifiers
     * @return Enumeration of all PageClassifiers
     */
    public Enumeration getPageClassifier() {
        return pageClassifierList.elements();
    }

    /**
     * Get an Enumeration of all LinkClassifiers
     * @return Enumeration of all LinkClassifiers
     */
    public Enumeration getLinkClassifier() {
        return linkClassifierList.elements();
    }

    /**
     * Get an Enumeration of all PageFilters
     * @return Enumeration of all PageFilters
     */
    public Enumeration getPageFilter() {
        return pageFilterList.elements();
    }

    /**
     * Get an Enumeration of all LinkFilters
     * @return Enumeration of all LinkFilters
     */
    public Enumeration getLinkFilter() {
        return linkFilterList.elements();
    }

    /**
     * Get the DefaultFilter
     * @return the DefaultFilter
     */
    DefaultFilter getDefaultFilter() {
        return defaultFilter;
    }

    /**
     * Enables the content-seen-test.
     * If the robot does a content-seen-test, the crawling stops at pages that have been seen before under a different url.
     * The ContentSeenFilter is added to the page filters only on the first call; further calls have no effect,
     * so the filter is never registered twice.
     */
    public void setDoContentSeenTest() {
        if (doContentSeenTest) {
            return;
        }
        doContentSeenTest = true;
        pageFilterList.add(new ContentSeenFilter());
    }

    /**
     * Checks if the robot does a content-seen-test.
     * The method name is misspelled; it is kept so that existing callers continue to work.
     * New code should prefer {@link #getDoContentSeenTest()}.
     * @return true, if the robot does a content-seen-test
     */
    public boolean getDoContenSeenTest() {
        return doContentSeenTest;
    }

    /**
     * Checks if the robot does a content-seen-test.
     * @return true, if the robot does a content-seen-test
     */
    public boolean getDoContentSeenTest() {
        return doContentSeenTest;
    }

    /**
     * If this is set to true, the robot saves the source code of every document. Documents that are in the database without source are
     * downloaded again.
     * @param requireSourceCode true, if the source code of the documents is required
     */
    public void setRequireSourceCode(boolean requireSourceCode) {
        this.requireSourceCode = requireSourceCode;
    }

    /**
     * Checks if the source code of the documents is required
     * @return true, if the source code is required (this is the initial setting)
     */
    public boolean getRequireSourceCode() {
        return requireSourceCode;
    }

    // Methods for the statistics and the crawling-result

    /**
     * Set the arrival time of this task at the robot to the current time
     */
    void setArrivalTime() {
        arrivalTime = System.currentTimeMillis();
    }

    /**
     * Get the arrival time of this task at the robot
     * @return arrival time in milliseconds since January 1, 1970 00:00:00 GMT, or -1 if not set yet
     */
    public long getArrivalTime() {
        return arrivalTime;
    }

    /**
     * Sets the start time (processing of the start uri) of this task to the current time.
     */
    void setStartTime() {
        startTime = System.currentTimeMillis();
    }

    /**
     * Get the start time of this task
     * @return start time in milliseconds since January 1, 1970 00:00:00 GMT, or -1 if not set yet
     */
    public long getStartTime() {
        return startTime;
    }

    /**
     * Set the end time of this task to the current time
     */
    void setEndTime() {
        endTime = System.currentTimeMillis();
    }

    /**
     * Get the end time for this task
     * @return end time in milliseconds since January 1, 1970 00:00:00 GMT, or -1 if not set yet
     */
    public long getEndTime() {
        return endTime;
    }

    /**
     * Increment the number of checked URIs
     */
    void incCheckedUris() {
        synchronized (statisticsLock) {
            checkedUris++;
        }
    }

    /**
     * Get the number of checked URIs.
     * Every link and frame is counted, even if the URI has been checked before.
     * @return checked URIs
     */
    public int getCheckedUris() {
        // Read under the same lock the increment uses, so the latest value is seen.
        synchronized (statisticsLock) {
            return checkedUris;
        }
    }

    /**
     * Increment the number of queued URIs
     */
    void incQueuedUris() {
        synchronized (statisticsLock) {
            queuedUris++;
        }
    }

    /**
     * Get the number of queued URIs
     * @return queued URIs
     */
    public int getQueuedUris() {
        synchronized (statisticsLock) {
            return queuedUris;
        }
    }

    /**
     * Increment the number of filtered URIs.
     */
    void incFilteredUris() {
        synchronized (statisticsLock) {
            filteredUris++;
        }
    }

    /**
     * Get the number of filtered URIs. The robot counts all URIs that were filtered by the DefaultFilter (wrong file-extension, restriction)
     * or by the LinkFilters of this task.
     * checkedUris = filteredUris + queuedUris + Uris that have been processed before.
     * @return filtered Uris
     */
    public int getFilteredUris() {
        synchronized (statisticsLock) {
            return filteredUris;
        }
    }

    /**
     * Increment the number of downloaded URIs
     */
    void incDownloadedUris() {
        synchronized (statisticsLock) {
            downloadedUris++;
        }
    }

    /**
     * Get the number of downloaded URIs
     * @return downloaded URIs
     */
    public int getDownloadedUris() {
        synchronized (statisticsLock) {
            return downloadedUris;
        }
    }

    /**
     * Increment the number of cache hits
     */
    void incCacheHits() {
        synchronized (statisticsLock) {
            cacheHits++;
        }
    }

    /**
     * Get the number of cache hits
     * queuedUris = downloadedUris + cacheHits
     * @return cache hits
     */
    public int getCacheHits() {
        synchronized (statisticsLock) {
            return cacheHits;
        }
    }

    /**
     * Add a QueueEntry to the open URIs
     * @param qe the entry to add
     */
    synchronized void addOpenUri(QueueEntry qe) {
        openUris.add(qe);
    }

    /**
     * Moves an element from the list of open URIs to the result set.
     * The node is only added if the result set does not contain it yet.
     * @param qe element from the list of open URIs
     * @param robotHtmlNode this node was retrieved during the processing of qe
     */
    synchronized void moveOpenUriToResult(QueueEntry qe, RobotHtmlNode robotHtmlNode) {
        if (!resultNodes.contains(robotHtmlNode)) {
            resultNodes.add(robotHtmlNode);
        }
        openUris.remove(qe);
    }

    /**
     * Checks if there's an element in this task's list of open URIs which is equal to uri
     * @param uri look for this uri
     * @return true, if an equal URI is open
     */
    public synchronized boolean isOpenUri(SimpleUri uri) {
        return getOpenUri(uri) != null;
    }

    /**
     * Looks for an element in this task's list of open URIs which is equal to uri
     * @param uri look for this uri
     * @return first QueueEntry with an URI equal to the parameter uri, or null if there is none
     */
    public synchronized QueueEntry getOpenUri(SimpleUri uri) {
        Enumeration entries = openUris.elements();

        while (entries.hasMoreElements()) {
            QueueEntry qe = (QueueEntry) entries.nextElement();

            if (qe.getURI().equals(uri)) {
                return qe;
            }
        }
        return null;
    }

    /**
     * Get the number of open URIs for this task.
     * @return number of open URIs
     */
    public synchronized int getNumberOfOpenUris() {
        return openUris.size();
    }

    /**
     * Get the URIs of this task the robot is currently working on.
     * Only obtaining the Enumeration is synchronized on this task; iterating over it happens after this method returned.
     * @return Enumeration of the open QueueEntry objects
     */
    public synchronized Enumeration getOpenUris() {
        return openUris.elements();
    }

    /**
     * Adds an element to the result set. Unlike moveOpenUriToResult, no duplicate check is done here.
     * @param robotHtmlNode add this node
     */
    synchronized public void addResultNode(RobotHtmlNode robotHtmlNode) {
        resultNodes.add(robotHtmlNode);
    }

    /**
     * Checks if this URI is in the result
     * @param uri check this URI
     * @return true, if uri is in the result
     */
    public synchronized boolean isResultUri(SimpleUri uri) {
        return findResultNode(uri) != null;
    }

    /**
     * Get the number of result nodes
     * @return number of result nodes
     */
    public synchronized int getNumberOfResultNodes() {
        return resultNodes.size();
    }

    /**
     * Get all result nodes.
     * Only obtaining the Enumeration is synchronized on this task; iterating over it happens after this method returned.
     * @return Enumeration of the result nodes
     */
    public synchronized Enumeration getResultNodes() {
        return resultNodes.elements();
    }

    /**
     * Get the result node with the URI equal to the parameter uri
     * @param uri get the result node for this URI
     * @return RobotHtmlNode with URI equal to the parameter uri or null
     */
    public synchronized RobotHtmlNode getResultNode(SimpleUri uri) {
        return findResultNode(uri);
    }

    /**
     * Searches the result set for the first node whose URI equals uri.
     * Nodes whose URI cannot be obtained (getSimpleUri fails or yields null) are skipped
     * instead of causing a NullPointerException.
     * Callers must hold the lock on this task.
     * @param uri look for this URI
     * @return the matching node or null
     */
    private RobotHtmlNode findResultNode(SimpleUri uri) {
        Enumeration nodes = resultNodes.elements();

        while (nodes.hasMoreElements()) {
            RobotHtmlNode robotHtmlNode = (RobotHtmlNode) nodes.nextElement();
            SimpleUri link = null;

            try {
                link = robotHtmlNode.getSimpleUri();
            } catch (Exception e) {
                // Deliberately ignored: a node without an obtainable URI cannot match
                // and is skipped by the null check below.
            }
            if (link != null && link.equals(uri)) {
                return robotHtmlNode;
            }
        }
        return null;
    }

    /**
     * Increase the number of open Threads used for this task. This method should only be called from an environment synchronized to urlQueue.
     */
    void incOpenThreads() {
        openThreads++;
    }

    /**
     * Decrease the number of open Threads used for this task. This method should only be called from an environment synchronized to urlQueue.
     */
    void decOpenThreads() {
        openThreads--;
    }

    /**
     * Using this function, a PageLoaderThread can see, if it is the only one currently working at this task.
     * This method should only be called from an environment synchronized to urlQueue.
     * @return true, if exactly one thread is working at this task
     */
    boolean noOtherThreadsOpen() {
        return openThreads == 1;
    }

    /**
     * Returns the number of PageLoaderThreads currently working at this task.
     * This method should only be called from an environment synchronized to urlQueue.
     * @return Number of threads
     */
    int openThreads() {
        return openThreads;
    }

    /**
     * Sets the stopped flag of the task
     */
    void stop() {
        stopped = true;
    }

    /**
     * Get the value of the stop flag
     * @return True, if the task was stopped unfinished
     */
    public boolean wasStopped() {
        return stopped;
    }

    /**
     * Sets the finished flag of the robotTask
     */
    void setFinished() {
        finished = true;
    }

    /**
     * Checks if the task was finished
     * @return True, if the task was finished
     */
    boolean isFinished() {
        return finished;
    }

}
