#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2010 British Broadcasting Corporation and Kamaelia Contributors(1)
# (1) Kamaelia Contributors are listed in the AUTHORS file and at
#     http://www.kamaelia.org/AUTHORS - please extend this file,
#     not this notice.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# See the License for the specific language governing permissions and
# limitations under the License.
# -------------------------------------------------------------------------
# Licensed to the BBC under a Contributor Agreement: RJL
Single-Shot HTTP Client
This component is for downloading a single file from an HTTP server.
Pick up data received from the server on its "outbox" outbox.
Generally you should use SimpleHTTPClient in preference to this.
Example Usage
How to use it::
If you want to use it directly, note that it doesn't output strings
but ParsedHTTPHeader, ParsedHTTPBodyChunk and ParsedHTTPEnd like
HTTPParser. This has the advantage of not buffering huge
files in memory but outputting them as a stream of chunks.
(with plain strings you would not know the contents of the
headers or at what point that response had ended!)
How does it work?
SingleShotHTTPClient accepts a URL parameter at its creation (to __init__).
When activated it creates an HTTPParser instance and then connects
to the webserver specified in the URL using a TCPClient component.
It sends an HTTP request and then any response from the server is received
by the HTTPParser.
HTTPParser processes the response and outputs it in parts as::
If SingleShotHTTPClient detects that the requested URL is a redirect page
(using the Location header) then it begins this cycle anew with the URL
of the new page, otherwise the parts of the page output by HTTPParser are
sent on to "outbox". 
Simple HTTP Client
This component downloads the pages corresponding to HTTP URLs received
on "inbox" and outputs their contents (file data) as a message, one per
URL, to "outbox" in the order they were received.
Example Usage
Type URLs, and they will be downloaded and placed, back to back, in "downloadedfile.txt"::
        ConsoleReader(">>> ", ""),
How does it work?
SimpleHTTPClient uses the Carousel component to create a new
SingleShotHTTPClient component for every URL requested. As URLs are
handled sequentially, it has only one SSHC child at any one time.
import string, time
from Axon.Component import component
from Axon.Ipc import producerFinished, shutdown
from Kamaelia.Util.Console import ConsoleReader, ConsoleEchoer
from Kamaelia.Chassis.Carousel import Carousel
from Kamaelia.Internet.TCPClient import TCPClient
from Kamaelia.Protocol.HTTP.HTTPParser import *
class ParsedHTTPRedirect(object):
    """Message object carrying the target of an HTTP redirect.

    Attributes:
    - redirectto -- the redirect target, stored exactly as passed in
    """

    def __init__(self, redirectto):
        # No validation or normalisation: the value is kept verbatim.
        setattr(self, "redirectto", redirectto)
def intval(mystring):
    """Convert a string to an integer, representing errors by None.

    Arguments:
    - mystring -- the value to convert (normally a string of digits)

    Returns the integer value, or None if the value cannot be converted.
    """
    try:
        return int(mystring)
    except (TypeError, ValueError):
        # ValueError: not an integer literal (e.g. "" or "abc").
        # TypeError: not a string or number at all (e.g. None).
        return None
def removeTrailingCr(line):
    """Return line with one trailing carriage return ("\\r") removed.

    Arguments:
    - line -- the string to trim

    Returns "" for an empty line, the line minus its final character when
    that character is "\\r", and the line unchanged otherwise.
    """
    if not line:
        return ""
    if line[-1] == "\r":
        return line[:-1]
    # No trailing CR: hand the line back untouched (previously this case
    # fell off the end of the function and returned None).
    return line
class HTTPRequest(object):
    """Pairs a prepared request with a count of redirects followed so far.

    Attributes:
    - requestobject -- the request description, stored as given
    - redirectcount -- the redirect count, stored as given
    """

    def __init__(self, requestobject, redirectcount):
        super(HTTPRequest, self).__init__()
        # Both values are kept verbatim; nothing is copied or validated.
        self.requestobject, self.redirectcount = requestobject, redirectcount
def AttachConsoleToDebug(comp):
    """Attach a ConsoleEchoer to the "debug" outbox of comp.

    Arguments:
    - comp -- a component with a "debug" outbox and a link() method

    The echoer is stored on comp as comp.debuggingconsole.
    """
    comp.debuggingconsole = ConsoleEchoer()
    comp.link((comp, "debug"), (comp.debuggingconsole, "inbox"))
    # A component that is linked but never activated is never scheduled, so
    # nothing sent to "debug" would be echoed without this call.
    # NOTE(review): assumes callers do not activate the echoer themselves.
    comp.debuggingconsole.activate()
class SingleShotHTTPClient(component):
    """
    SingleShotHTTPClient() -> component that can download a file using HTTP by URL

    Arguments:

    - starturl     -- the URL of the file to download
    - [postbody]   -- data to POST to that URL - if set to None becomes an empty body in a POST (or PUT) request
    - [connectionclass] -- specify a class other than TCPClient to connect with
    - [extraheaders] -- dict of extra header names -> values to add to the request
    - [method]     -- the HTTP method for the request (defaults to GET normally, or POST if postbody != "")
    """

    Inboxes =  {
        "inbox"          : "UNUSED",
        # NOTE: despite the description, mainBody() does read "control" and
        # stops when it receives a shutdown message.
        "control"        : "UNUSED",
        "_parserinbox"   : "Data from HTTP parser",
        "_parsercontrol" : "Signals from HTTP parser",
        "_tcpcontrol"    : "Signals from TCP client",
    }

    Outboxes = {
        "outbox"         : "Requested file",
        "debug"          : "Output to aid debugging",
        "_parsersignal"  : "Signals for HTTP parser",
        "_tcpoutbox"     : "Send over TCP connection",
        "_tcpsignal"     : "Signals shutdown of TCP connection",
        "signal"         : "UNUSED"
    }

    def __init__(self, starturl, postbody = "", connectionclass = TCPClient, extraheaders = None, method = None):
        """Store the request parameters; nothing is sent until the component runs."""
        super(SingleShotHTTPClient, self).__init__()
        self.tcpclient = None
        self.httpparser = None
        self.requestqueue = []
        self.starturl = starturl
        self.connectionclass = connectionclass
        self.method = method
        self.postbody = postbody
        # A fresh dict per instance when no headers were supplied.
        if extraheaders is not None:
            self.extraheaders = extraheaders
        else:
            self.extraheaders = {}

    def formRequest(self, url):
        """Craft a HTTP request for the supplied url.

        Returns the dict produced by splitUri(url) with an extra "request"
        key: a list whose first item is the request line plus headers, and
        whose second item (if any) is the body to send.
        """
        splituri = splitUri(url)

        host = splituri["uri-server"]
        if "uri-port" in splituri:
            host += ":" + splituri["uri-port"]

        lines = []
        method = self.method
        if self.postbody == "":
            # No body: a plain request, GET unless told otherwise.
            if not method:
                method = 'GET'
            lines.append(method + " " + splituri["raw-uri"] + " HTTP/1.1\r\n")
        else:
            # A body (or None, meaning an explicitly empty body): POST unless told otherwise.
            if not method:
                method = 'POST'
            lines.append(method + " " + splituri["raw-uri"] + " HTTP/1.1\r\n")
            if self.postbody is not None:
                lines.append("Content-Length: " + str(len(self.postbody)) + "\r\n")
            else:
                lines.append("Content-Length: 0\r\n")

        lines.append("Host: " + host + "\r\n")
        lines.append("User-agent: Kamaelia HTTP Client 0.3 (RJL)\r\n")
        lines.append("Connection: Keep-Alive\r\n") # keep-alive is a work around for lack of shutdown notification in TCPClient
        for header in self.extraheaders:
            lines.append("%s: %s\r\n" % (header, self.extraheaders[header]))

        # The header section must end with an empty line, otherwise the
        # server keeps waiting for more headers.
        lines.append("\r\n")

        # Send the request line and headers as one string rather than many small ones.
        splituri["request"] = ["".join(lines)]

        if self.postbody not in [None, ""]:
            splituri["request"].append(self.postbody)

        return splituri

    def makeRequest(self, request):
        """Connect to the remote HTTP server and send request.

        request is an HTTPRequest whose requestobject came from formRequest().
        """
        self.tcpclient = None
        self.httpparser = None
        port = intval(request.requestobject.get("uri-port", ""))
        if port is None:
            port = 80

        self.tcpclient = self.connectionclass(request.requestobject["uri-server"], port)
        self.httpparser = HTTPParser(mode="response")

        self.link( (self, "_tcpoutbox"),       (self.tcpclient, "inbox") )
        self.link( (self, "_tcpsignal"),       (self.tcpclient, "control") )
        self.link( (self.tcpclient, "signal"), (self, "_tcpcontrol") )

        self.link( (self.tcpclient, "outbox"), (self.httpparser, "inbox") ) #incoming TCP data -> HTTPParser directly

        self.link( (self, "_parsersignal"), (self.httpparser, "control") )
        self.link( (self.httpparser, "outbox"), (self, "_parserinbox") )
        self.link( (self.httpparser, "signal"), (self, "_parsercontrol") )

        self.addChildren( self.tcpclient, self.httpparser )
        # Children must be activated or they are never scheduled and the
        # request would sit unsent in their inboxes.
        self.tcpclient.activate()
        self.httpparser.activate()

        self.response = ""
        payload = request.requestobject["request"]
        if isinstance(payload, str):
            self.send(payload, "_tcpoutbox")
        else:
            for part in payload:
                self.send(part, "_tcpoutbox")

    def shutdownKids(self):
        """Close TCP connection and HTTP parser"""
        if self.tcpclient is not None and self.httpparser is not None:
            self.send(producerFinished(), "_tcpsignal")
            self.send(shutdown(), "_parsersignal")
            # NOTE(review): removeChild is relied on to deregister the child
            # and its linkages so that makeRequest() can link the same boxes
            # again when following a redirect - confirm against Axon.
            self.removeChild(self.tcpclient)
            self.removeChild(self.httpparser)
            self.tcpclient = None
            self.httpparser = None

    def handleRedirect(self, header):
        """Check for a redirect response and queue the fetching the page it points to if it is such a response.
        Returns true if it was a redirect page and false otherwise."""
        if header["responsecode"] not in ["301", "302", "303", "307"]:
            return False

        # location header gives the redirect URL
        newurl = header["headers"].get("location", "")
        if newurl == "":
            # A redirect without a target cannot be followed; the caller
            # treats it like any other non-redirect response.
            return False

        self.send(ParsedHTTPRedirect(redirectto=newurl), "outbox")
        redirectedrequest = HTTPRequest(self.formRequest(newurl), self.currentrequest.redirectcount + 1)
        # Queue it: mainBody() starts it once the current response has ended.
        self.requestqueue.append(redirectedrequest)
        return True

    def main(self):
        """Main loop."""
        self.requestqueue.append(HTTPRequest(self.formRequest(self.starturl), 0))
        while self.mainBody():
            yield 1
        self.send(producerFinished(self), "signal")
        yield 1

    def mainBody(self):
        """Called repeatedly by main loop. Checks inboxes and processes messages received.
        Start the fetching of the new page if the current one is a redirect and has been
        completely fetched.

        Returns 1 to keep running, 0 when the component should stop."""
        self.send("SingleShotHTTPClient.mainBody()", "debug")
        while self.dataReady("_parserinbox"):
            msg = self.recv("_parserinbox")
            if isinstance(msg, ParsedHTTPHeader):
                self.send("SingleShotHTTPClient received a ParsedHTTPHeader on _parserinbox", "debug")
                # if the page is a redirect page
                if not self.handleRedirect(msg.header):
                    if msg.header["responsecode"] == "200":
                        self.send(msg, "outbox") # if not redirecting then send the response on
                    else:  #treat as not found
                        pass  # the header is deliberately not forwarded

            elif isinstance(msg, ParsedHTTPBodyChunk):
                self.send("SingleShotHTTPClient received a ParsedHTTPBodyChunk on _parserinbox", "debug")
                if len(self.requestqueue) == 0: # if not redirecting then send the response on
                    self.send(msg, "outbox")

            elif isinstance(msg, ParsedHTTPEnd):
                self.send("SingleShotHTTPClient received a ParsedHTTPEnd on _parserinbox", "debug")
                if len(self.requestqueue) == 0: # if not redirecting then send the response on
                    self.send(msg, "outbox")
                # The response is complete: drop the connection so the check
                # below can either start the queued redirect or finish.
                self.shutdownKids()
                return 1

        while self.dataReady("_parsercontrol"):
            self.recv("_parsercontrol")  # signals from the parser are only logged
            self.send("SingleShotHTTPClient received something on _parsercontrol", "debug")

        while self.dataReady("_tcpcontrol"):
            msg = self.recv("_tcpcontrol")
            self.send(msg, "_parsersignal")

        while self.dataReady("control"):
            msg = self.recv("control")
            if isinstance(msg, shutdown):
                self.shutdownKids()
                return 0

        # if we're not currently downloading a page
        if self.tcpclient is None:
            # then either we've finished or we should download the next URL (if we've been redirected)
            if len(self.requestqueue) == 0:
                return 0
            self.currentrequest = self.requestqueue.pop(0)
            if self.currentrequest.redirectcount >= 3: # 3 redirects is excessive, give up, we're probably in a loop anyway
                return 0
            self.makeRequest(self.currentrequest)

        return 1
def makeSSHTTPClient(paramdict):
    """Creates a SingleShotHTTPClient for the given URL. Needed for Carousel.

    Arguments:
    - paramdict -- dict with optional keys "url", "postbody", "extraheaders"
                   and "method"; missing keys fall back to "", "", None and
                   None respectively
    """
    return SingleShotHTTPClient(
        paramdict.get("url", ""),
        paramdict.get("postbody", ""),
        extraheaders = paramdict.get("extraheaders", None),
        method = paramdict.get('method', None)
    )
class SimpleHTTPClient(component):
    """
    SimpleHTTPClient() -> component that downloads each URL sent to its
    "inbox" and sends the body of each response, as one string per URL,
    to its "outbox".

    Each request is handed to a Carousel, which builds a new
    SingleShotHTTPClient for it via makeSSHTTPClient.
    """

    Inboxes = {
        "inbox"           : "URLs to download - a dict {'url':'x', 'postbody':'y'} or a just the URL as a string ",
        "control"         : "Shut me down",
        "_carouselready"  : "Receive NEXT when carousel has completed a request",
        "_carouselinbox"  : "Data from SingleShotHTTPClient via Carousel"
    }

    Outboxes = {
        "outbox"          : "Requested file's data string",
        "signal"          : "Signal I have shutdown",
        "_carouselnext"   : "Create a new SingleShotHTTPClient",
        "_carouselsignal" : "Shutdown the carousel",
        "debug"           : "Information to aid debugging"
    }

    def __init__(self):
        """Create and link to a carousel object"""
        super(SimpleHTTPClient, self).__init__()

        # now create our Carousel subcomponent
        self.carousel = Carousel(componentFactory=makeSSHTTPClient)
        self.addChildren(self.carousel)
        self.link((self, "_carouselnext"),        (self.carousel, "next"))
        self.link((self, "_carouselsignal"),      (self.carousel, "control"))
        self.link((self.carousel, "outbox"),      (self, "_carouselinbox"))
        self.link((self.carousel, "requestNext"), (self, "_carouselready"))
        # The carousel must be activated or it is never scheduled and no
        # request sent to "_carouselnext" would ever be serviced.
        self.carousel.activate()

    def cleanup(self):
        """Destroy child components and send producerFinished when we quit."""
        self.send(producerFinished(self), "_carouselsignal") #shutdown() not currently supported by Carousel
        self.send(producerFinished(self), "signal")

    def debug(self, msg):
        """Send msg to the "debug" outbox."""
        self.send(msg, "debug")

    def main(self):
        """Main loop.

        Handles one request at a time: forwards it to the carousel, collects
        the body chunks that come back, and emits them joined as one string
        once the carousel reports it is ready for the next request.
        """
        finished = False
        while not finished:
            yield 1
            while self.dataReady("inbox"):
                paramdict = self.recv("inbox")
                # we accept either string or dict messages - if it's a string then
                # we assume you mean that's the URL you want fetched
                if isinstance(paramdict, str):
                    paramdict = { "url": paramdict }
                self.debug("SimpleHTTPClient received url " + paramdict.get("url","") + "\n")

                # request creation of a new SingleShotHTTPClient by Carousel
                self.send(paramdict, "_carouselnext")

                # store as a list of strings then join at the
                # end to avoid O(n^2) time string cat'ing behaviour
                filebody = []
                carouselbusy = True
                while carouselbusy:
                    yield 1
                    while self.dataReady("_carouselinbox"):
                        msg = self.recv("_carouselinbox")
                        if isinstance(msg, ParsedHTTPBodyChunk):
                            # NOTE(review): assumes ParsedHTTPBodyChunk exposes
                            # its payload as .bodychunk - confirm in HTTPParser.
                            filebody.append(msg.bodychunk)

                    while self.dataReady("control"):
                        msg = self.recv("control")
                        if isinstance(msg, producerFinished):
                            # finish the current download, then leave the outer loop
                            finished = True
                        elif isinstance(msg, shutdown):
                            # immediate stop: abandon the current download
                            self.cleanup()
                            return

                    while self.dataReady("_carouselready"):
                        msg = self.recv("_carouselready")
                        carouselbusy = False

                self.send("".join(filebody), "outbox")
                filebody = [] # free up some memory used by the now unneeded list

            while self.dataReady("control"):
                msg = self.recv("control")
                if isinstance(msg, producerFinished):
                    finished = True
                elif isinstance(msg, shutdown):
                    self.cleanup()
                    return

        self.debug("eoml in SimpleHTTPClient\n")
        # Normal exit: shut the carousel down and signal that we are done.
        self.cleanup()
        yield 1
# Components advertised by this module.
# NOTE(review): presumably read by Kamaelia's component discovery tooling - confirm.
__kamaelia_components__ = (
    SimpleHTTPClient,
    SingleShotHTTPClient,
)
if __name__ == '__main__':
    from Kamaelia.Chassis.Pipeline import Pipeline
    from Kamaelia.Util.Console import ConsoleReader, ConsoleEchoer
    from Kamaelia.File.Writing import SimpleFileWriter
    # Example - type in a URL e.g. http://www.google.co.uk and have that page saved to disk
        ConsoleReader(">>> ", ""),