/*
 * Scone - The Web Enhancement Framework
 * Copyright (C) 2009 Harald Weinreich, Volkert Buchmann, Frank Wollenweber, Torsten Ha
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
 package scone.robot;


import java.util.Enumeration;
import java.util.Locale;
import java.util.Properties;
import java.util.StringTokenizer;
import java.util.Timer;
import java.util.Vector;

import scone.netobjects.SimpleUri;
import scone.util.PersistentProperties;
import Acme.NoRobots;


/**
 * The central class of the scone.robot package.
 * <br>
 * Use the static method <code>instance()</code> to get the robot object.<br>
 * All objects which use services of the robot package need only communicate with the robot object.
 * A class which uses the robot must implement the RobotUser interface.
 * <br><br>
 * As you can see in the following example the use of the robot is quite simple:
 * <br><br>
 * <code>
 *  public class RobotTest implements RobotUser { // All classes that use the robot must implement this interface<br>
 *  ...<br>
 *  Robot robot = Robot.instance(); // Get a reference to the Scone-Robot<br>
 *  SimpleUri startUri = new SimpleUri("http://www.informatik.uni-hamburg.de"); // The robot starts with this URI<br>
 *  RobotTask rt = new RobotTask(startUri,3,RobotTask.ALL,this); // Create a new RobotTask<br>
 *  rt.setMaxPageSize(5000); // Set some properties for this task<br>
 *  ...<br>
 *  robot.scan(rt);<br>
 * </code>
 * <br>
 * The robot is configured from <code>config/scone/robot.xml</code>. The settings
 * "Maximum number of threads", "Timeout for idle threads", "Thread priority" and
 * "Network timeout" are mandatory; "Robot name", "Valid file extensions" and
 * "Queue class" are optional and fall back to built-in defaults.
 *
 * @author Frank Wollenweber
 */
public class Robot {

    /** Location of the robot configuration file. */
    private static final String CONFIG_FILE = "config/scone/robot.xml";

    /** User-agent name used when the configuration does not define "Robot name". */
    private static final String DEFAULT_ROBOT_NAME = "Scone-Robot";

    /** Queue implementation used when the configuration does not define "Queue class". */
    private static final String DEFAULT_QUEUE_CLASS = "scone.robot.BreadthSearchURLQueue";

    static private Robot instance = null; // the only instance of the robot. Used for the implementation of the singleton pattern
    static private int lastRobotTaskId; // The id of the last RobotTask (starts at 0, the default value of a static int)

    private final PageLoaderPool pageLoaderPool; // implementation of a threadpool for the efficient handling of the pageLoaderThreads
    private final HttpConnectionPool httpConnectionPool; // pool of http connections
    private final URLQueue urlQueue; // queue for the pending urls
    private final NoRobots noRobotsTester; // implementation of the robot exclusion protocol
    private final Timer timer; // This timer is used to interrupt PageLoaderThreads
    private final Vector robotTasks; // the pending robotTasks
    private final String robotName; // name of the robot used in the user-agent field of the http headers
    private int maxConcurrentRequests; // maximum requests per server (currently not implemented, never assigned)
    private final int timeout; // timeout for the http connections in milliseconds
    private final int maxThreads; // the maximum number of threads
    private final int maxIdleTime; // The number of milliseconds a thread in the pageLoaderPool is allowed to be idle before it is killed
    private final int threadPriority; // Priority of the pageLoaderThreads
    private final Vector validFileExtensions; // The robot should only download files with these (lower-case) extensions

    /* Implementation of the singleton pattern */

    /**
     * Default constructor of the robot. Reads the configuration, creates the URL queue
     * and sets up the thread pool, the connection pool, the robot exclusion tester and the timer.
     *
     * @throws NumberFormatException if a mandatory numeric setting is missing, is not an
     *         integer or lies outside its allowed range
     * @throws IllegalStateException if the URL queue class cannot be instantiated
     */
    private Robot() {
        // load preferences for the Robot
        PersistentProperties props = new PersistentProperties(CONFIG_FILE);

        maxThreads = readBoundedInt(props, "Maximum number of threads", 1, 100);
        maxIdleTime = readBoundedInt(props, "Timeout for idle threads", 1, 100000);
        threadPriority = readBoundedInt(props, "Thread priority",
                Thread.MIN_PRIORITY, Thread.MAX_PRIORITY);

        String configuredName = props.get("Robot name");

        robotName = (configuredName != null) ? configuredName : DEFAULT_ROBOT_NAME;
        timeout = readBoundedInt(props, "Network timeout", 1, 100000);
        validFileExtensions = readValidFileExtensions(props);

        String queueClass = props.get("Queue class");

        if (queueClass == null) {
            queueClass = DEFAULT_QUEUE_CLASS;
        }
        urlQueue = createUrlQueue(queueClass);

        Properties pageLoaderPoolProperties = new Properties();

        pageLoaderPoolProperties.setProperty("minThreads", "0");
        pageLoaderPoolProperties.setProperty("maxThreads", Integer.toString(maxThreads));
        pageLoaderPoolProperties.setProperty("maxIdleTime", Integer.toString(maxIdleTime));
        pageLoaderPoolProperties.setProperty("priority", Integer.toString(threadPriority));
        pageLoaderPool = new PageLoaderPool(pageLoaderPoolProperties, this, urlQueue);
        httpConnectionPool = new HttpConnectionPool(maxThreads * 2);
        noRobotsTester = new NoRobots(robotName);
        timer = new Timer(true);
        robotTasks = new Vector();
        // lastRobotTaskId is intentionally NOT reset here: getNextRobotTaskId() is static and
        // may already have handed out ids before the robot is created; resetting the counter
        // would make it issue the same ids again.
    }

    /**
     * Reads a mandatory integer setting and checks that it lies within [min, max].
     *
     * @param props the loaded robot configuration
     * @param key name of the setting
     * @param min smallest allowed value (inclusive)
     * @param max largest allowed value (inclusive)
     * @return the parsed value
     * @throws NumberFormatException if the setting is missing, not an integer or out of range
     */
    private static int readBoundedInt(PersistentProperties props, String key, int min, int max) {
        String value = props.get(key);

        if (value == null) {
            throw new NumberFormatException("Missing robot setting \"" + key + "\"");
        }
        int parsed = Integer.parseInt(value);

        if ((parsed < min) || (parsed > max)) {
            throw new NumberFormatException("Robot setting \"" + key + "\" is " + parsed
                    + " but must be between " + min + " and " + max);
        }
        return parsed;
    }

    /**
     * Builds the list of accepted file extensions. Uses the whitespace separated,
     * lower-cased tokens of the setting "Valid file extensions" or, if that setting
     * is absent, the defaults "htm" and "html".
     *
     * @param props the loaded robot configuration
     * @return Vector of lower-case extension Strings without the leading dot
     */
    private static Vector readValidFileExtensions(PersistentProperties props) {
        Vector extensions = new Vector();
        String configured = props.get("Valid file extensions");

        if (configured == null) {
            extensions.add("htm");
            extensions.add("html");
            return extensions;
        }
        // Fixed locale: the default locale may map letters unexpectedly (e.g. Turkish 'I').
        StringTokenizer st = new StringTokenizer(configured.toLowerCase(Locale.ENGLISH));

        while (st.hasMoreTokens()) {
            extensions.add(st.nextToken());
        }
        return extensions;
    }

    /**
     * Instantiates the URL queue implementation by class name.
     *
     * @param className fully qualified name of a class implementing URLQueue
     * @return a new queue instance
     * @throws IllegalStateException if the class cannot be loaded, instantiated or is not a URLQueue.
     *         Failing here avoids a robot without a queue, which would only fail later
     *         with a NullPointerException when the queue is first used.
     */
    private static URLQueue createUrlQueue(String className) {
        try {
            return (URLQueue) Class.forName(className).newInstance();
        } catch (Exception e) {
            IllegalStateException failure = new IllegalStateException(
                    "Cannot create URL queue of class " + className + ": " + e);

            failure.initCause(e);
            throw failure;
        }
    }

    /**
     * Use this method to get the instance of the robot. If there is no robot active, the default
     * constructor of the robot is used. The method is synchronized so that concurrent first
     * calls cannot create more than one robot.
     *
     * @return the robot
     */
    static public synchronized Robot instance() {
        if (instance == null) {
            instance = new Robot();
        }
        return instance;
    }

    /* Handling RobotTasks */

    /**
     * With this method you can advise the robot to execute a specified task.
     * The start URI is counted as checked. If the default filter of the task accepts the start URI,
     * the task is registered, the start URI is queued and - unless the expiry of the task is -1 -
     * an expiry timer is scheduled. Otherwise the URI is counted as filtered and the RobotUser
     * is told immediately that the task is finished.
     *
     * @param robotTask specifies what the robot should do.
     */
    public void scan(RobotTask robotTask) {
        robotTask.setArrivalTime();
        robotTask.incCheckedUris();
        if (robotTask.getDefaultFilter().filter(robotTask.getStartURI())) {
            robotTasks.add(robotTask);
            QueueEntry qe = new QueueEntry(robotTask.getStartURI(), robotTask.getDepth(), robotTask);

            queue(qe);
            robotTask.incQueuedUris();
            RobotTaskExpiryTimer et = new RobotTaskExpiryTimer(robotTask, this);

            // An expiry of -1 means the task has no time limit.
            if (qe.getRobotTask().getExpiry() != -1) {
                timer.schedule(et, qe.getRobotTask().getExpiry());
            }
        } else {
            robotTask.incFilteredUris();
            robotTask.getRobotUser().robotTaskFinished(robotTask);
        }
    }

    /**
     * With this method you can advise the robot to stop a specified task.
     * All entries of this task in the urlqueue are deleted. The pages currently downloading are finished.
     * If the task has no open threads, it is removed from the pending tasks and its
     * RobotUser is notified (once) that the task is finished.
     *
     * @param robotTask specifies the task to stop.
     */
    public void stopRobotTask(RobotTask robotTask) {
        robotTask.stop();
        urlQueue.removeAllQueueEntries(robotTask);
        synchronized (urlQueue) {
            if (robotTask.openThreads() == 0) {
                robotTasks.remove(robotTask);
                if (!robotTask.isFinished()) {
                    robotTask.getRobotUser().robotTaskFinished(robotTask);
                }
                robotTask.setFinished();
            }
        }
    }

    /**
     * With this method you can advise the robot to stop a specified task.
     * All entries of this task in the urlqueue are deleted. The pages currently downloading are finished.
     * Nothing happens if no pending task has the given id.
     *
     * @param robotTaskId id of the robotTask.
     */
    public void stopRobotTask(int robotTaskId) {
        RobotTask match = null;

        // Look the task up while holding the vector's monitor so that concurrent removals
        // cannot disturb the enumeration. The monitor is released before stopping the task,
        // because stopping takes the urlQueue lock and pageFinished() acquires the two
        // locks in the order urlQueue -> robotTasks.
        synchronized (robotTasks) {
            Enumeration tasks = robotTasks.elements();

            while (tasks.hasMoreElements()) {
                RobotTask candidate = (RobotTask) tasks.nextElement();

                if (candidate.getId() == robotTaskId) {
                    match = candidate;
                    break;
                }
            }
        }
        if (match != null) {
            stopRobotTask(match);
        }
    }

    /**
     * Get all robot tasks the robot currently has to handle.
     * The returned Vector is the robot's own list, not a copy.
     *
     * @return The vector containing all robotTasks
     */
    public Vector getRobotTasks() {
        return robotTasks;
    }

    /**
     * Get the number of robot tasks the robot currently has to handle.
     * @return number of robotTasks
     */
    public int getNumberOfRobotTasks() {
        return robotTasks.size();
    }

    /**
     * This method is called after every finished page.<br>
     * It calls the <code>robotNewPage(RobotHtmlNode robotHtmlNode, RobotTask robotTask)</code> method of the <code>RobotUser</code> who started the task.
     * When a task is finished <code>robotTaskFinished(RobotTask robotTask)</code> of the <code>RobotUser</code> is called.
     * A task counts as finished when it has no pending queue entries, no open URIs,
     * no other open threads and has not been marked finished before.
     *
     * @param qe the entry of the <code>URLQueue</code> which was processed.
     * @param robotHtmlNode the downloaded webpage
     */
    void pageFinished(QueueEntry qe, RobotHtmlNode robotHtmlNode) {
        RobotTask task = qe.getRobotTask();

        task.getRobotUser().robotNewPage(robotHtmlNode, task);
        synchronized (urlQueue) {
            if ((urlQueue.getNumberOfPendingQueueEntries(task) == 0)
                    && (task.getNumberOfOpenUris() == 0)
                    && (task.noOtherThreadsOpen())
                    && (!task.isFinished())) {
                task.setEndTime();
                robotTasks.remove(task);
                task.getRobotUser().robotTaskFinished(task);
                task.setFinished();
            }
            task.decOpenThreads();
        }
    }

    /* methods for getting attributes of the robot */

    /**
     * Get the name of the robot used in the http requests in the user agent field
     * @return name of the robot
     */
    public String getRobotName() {
        return robotName;
    }

    /**
     * The maximum number of concurrent requests the robot will send to one server.
     * This limit is currently not implemented; the value is never set and therefore 0.
     *
     * @return number of concurrent requests
     */
    public int getMaxConcurrentRequests() {
        return maxConcurrentRequests;
    }

    /**
     * The timeout for the http connections in milliseconds
     * @return timeout
     */
    public int getTimeout() {
        return timeout;
    }

    /**
     * The maximum number of threads in the threadpool
     * @return max number of threads
     */
    public int getMaxNumberOfThreads() {
        return maxThreads;
    }

    /**
     * The number of milliseconds a thread is allowed to be idle before it is killed
     * @return idle time
     */
    public int getMaxIdleTime() {
        return maxIdleTime;
    }

    /* methods for accessing the PageLoaderPool and getting status information */

    /**
     * Add a new <code>QueueEntry</code> to the <code>URLQueue</code> of the <code>PageLoaderPool</code>.
     * @param qe a <code>QueueEntry</code> which belongs to a <code>RobotTask</code> and which advises the robot to scan a page in a certain depth
     */
    synchronized void queue(QueueEntry qe) {
        pageLoaderPool.queue(qe);
    }

    /**
     * Remove a queueEntry from the urlQueue
     * @param qe the queueEntry to remove
     */
    synchronized void removeQueueEntry(QueueEntry qe) {
        urlQueue.removeQueueEntry(qe);
    }

    /**
     * Remove all queueEntries of the specified robotTask
     * @param robotTask removes all queue entries of this task
     */
    synchronized void removeAllQueueEntries(RobotTask robotTask) {
        urlQueue.removeAllQueueEntries(robotTask);
    }

    /**
     * Print the status of the threadpool to standard output
     */
    public synchronized void printPageLoaderPoolStatus() {
        System.out.println(pageLoaderPool.getStats());
    }

    /**
     * Get the status of the threadpool
     * @return status object
     */
    public synchronized PageLoaderPoolStats getPageLoaderPoolStatus() {
        return pageLoaderPool.getStats();
    }

    /**
     * The number of jobs in the threadpool (jobs in progress plus pending jobs).
     * These jobs may belong to many robotTasks
     * @return number of jobs in the threadpool
     */
    public synchronized int getJobsInPageLoaderPool() {
        // Read both counters from one stats object so that they belong together.
        PageLoaderPoolStats stats = pageLoaderPool.getStats();

        return (stats.jobsInProgress + stats.pendingJobs);
    }

    /**
     * Use this method to check if an url is pending in the execution of a robotTask
     * @param uri the URI to check
     * @param robotTask only QueueEntries of this task are considered during the search
     * @return true, if the url is pending
     */
    public synchronized boolean isPendingURL(SimpleUri uri, RobotTask robotTask) {
        return urlQueue.isPendingURL(uri, robotTask);
    }

    /**
     * Use this method to get the QueueEntry with the same url from the URLQueue which belongs to the robotTask
     * @param uri the URI to check
     * @param robotTask only QueueEntries of this task are considered during the search
     * @return the queueEntry which fits to the parameters
     */
    public synchronized QueueEntry getPendingURL(SimpleUri uri, RobotTask robotTask) {
        return urlQueue.getPendingURL(uri, robotTask);
    }

    /**
     * Use this method to get the Vector of QueueEntries for this uri
     * @param uri the URI to check
     * @return the Vector of queueEntries for this uri
     */
    public synchronized Vector getPendingQueueEntries(SimpleUri uri) {
        return urlQueue.getPendingQueueEntries(uri);
    }

    /**
     * Get the number of QueueEntries for this uri
     * @param uri the URI to check
     * @return number of QueueEntries for this Uri
     */
    public synchronized int getNumberOfPendingQueueEntries(SimpleUri uri) {
        return urlQueue.getNumberOfPendingQueueEntries(uri);
    }

    /**
     * Get all pending QueueEntries which belong to robotTask
     * @param robotTask get entries from this robotTask
     * @return Vector of queueEntries
     */
    public synchronized Vector getPendingQueueEntries(RobotTask robotTask) {
        return urlQueue.getPendingQueueEntries(robotTask);
    }

    /**
     * Get the number of QueueEntries from this robotTask
     * @param robotTask only QueueEntries of this task are counted
     * @return number of queueEntries
     */
    public synchronized int getNumberOfPendingQueueEntries(RobotTask robotTask) {
        return urlQueue.getNumberOfPendingQueueEntries(robotTask);
    }

    /* Service functions for the other classes of the robot package */


    /**
     * Tests, if it is ok to load a document conforming to the robot exclusion standard
     * @param uri the uri to test
     * @return returns <code>true</code>, if it is ok to load the page
     */
    boolean noRobotsTest(SimpleUri uri) {
        return noRobotsTester.ok(uri.toURL());
    }

    /**
     * Get the httpConnectionPool for downloading pages using existing connections
     * @return the httpConnectionPool
     */
    HttpConnectionPool getHttpConnectionPool() {
        return httpConnectionPool;
    }

    /**
     * Get the timer used for interrupting PageLoaderThreads
     * @return timer
     */
    Timer getTimer() {
        return timer;
    }

    /**
     * Checks if the file has one of the extensions specified in robot.xml.
     * Empty names and names without any dot are always accepted. Otherwise the lower-cased
     * name is accepted if it contains "." followed by a valid extension anywhere in the name
     * (a substring match, not only a match at the end of the name).
     *
     * @param file the file to check
     * @return true, if the extension is valid
     */
    public boolean isValidFileExtension(String file) {
        if (file.length() == 0) {
            return true;
        } // Empty file name
        if (file.indexOf(".") == -1) {
            return true;
        } // No file extension
        // Fixed locale, matching the lower-casing applied to the configured extensions.
        String lowerCaseFile = file.toLowerCase(Locale.ENGLISH);
        Enumeration extensions = validFileExtensions.elements();

        while (extensions.hasMoreElements()) {
            String extension = "." + (String) extensions.nextElement();

            if (lowerCaseFile.indexOf(extension) != -1) {
                return true;
            }
        }
        return false;
    }

    /**
     * Get the next unique RobotTask ID. The first id handed out is 1.
     * @return id
     */
    public static synchronized int getNextRobotTaskId() {
        lastRobotTaskId++;
        return lastRobotTaskId;
    }

}
