#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2010 British Broadcasting Corporation and Kamaelia Contributors(1)
#
# (1) Kamaelia Contributors are listed in the AUTHORS file and at
#     http://www.kamaelia.org/AUTHORS - please extend this file,
#     not this notice.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -------------------------------------------------------------------------
"""\
=======================
Single-Shot HTTP Client
=======================

This component is for downloading a single file from an HTTP server.
Pick up data received from the server on its "outbox" outbox.



Example Usage
-------------

Generally you should use SimpleHTTPClient in preference to this.
If you want to use it directly, note that it doesn't output strings
but ParsedHTTPHeader, ParsedHTTPBodyChunk and ParsedHTTPEnd like
HTTPParser.

    pipeline(
        SingleShotHTTPClient("http://www.google.co.uk/"),
        SomeComponentThatUnderstandsThoseMessageTypes()
    ).run()



How does it work?
-----------------

SingleShotHTTPClient creates an HTTPParser instance and then connects
to the HTTP server using a TCPClient component. It sends an HTTP request
and then any response from the server is received by the HTTPParser.

HTTPParser processes the response and outputs it in parts as:

    ParsedHTTPHeader,
    ParsedHTTPBodyChunk,
    ParsedHTTPBodyChunk,
    ...
    ParsedHTTPBodyChunk,
    ParsedHTTPEnd

If SingleShotHTTPClient detects that the requested URL is a redirect page
(using the Location header) then it begins this cycle anew with the URL
of the new page, otherwise the parts of the page output by HTTPParser are
sent on through "outbox".



==================
Simple HTTP Client
==================

This component downloads the pages corresponding to HTTP URLs received
on "inbox" and outputs their contents (file data) as a message, one per
URL, in the order they were received.



Example Usage
-------------

    pipeline(
        ConsoleReader(">>> ", ""),
        SimpleHTTPClient(),
        SimpleFileWriter("downloadedfile.txt"),
    ).run()



How does it work?
-----------------

SimpleHTTPClient uses the Carousel component to create a new
SingleShotHTTPClient component for every URL requested. As URLs are
handled sequentially, there is only one SSHC at any one time.
"""

from Axon.Component import component
from Kamaelia.Util.Console import ConsoleReader, ConsoleEchoer
from Kamaelia.Chassis.Carousel import Carousel
from Axon.Ipc import producerFinished, shutdownMicroprocess, shutdown
from Kamaelia.Internet.TCPClient import TCPClient as TCPClient
import string
import time

from HTTPParser import *


def intval(mystring):
    """Convert mystring to an int, returning None if it cannot be converted.

    Both ValueError (e.g. "" or "abc") and TypeError (e.g. None) are treated
    as "not a number" so callers can fall back to a default.
    """
    try:
        return int(mystring)
    except (TypeError, ValueError):
        return None


def removeTrailingCr(line):
    """Return line without a single trailing carriage return, if it has one.

    An empty line yields "".
    """
    if not line:
        return ""
    if line[-1] == "\r":
        return line[:-1]
    return line


class HTTPRequest(object):
    """Pairs a request description with the number of redirects followed so far.

    Arguments:

    - requestobject  -- the request description (SingleShotHTTPClient builds
                        these with formRequest)
    - redirectcount  -- how many redirects led to this request (0 for the
                        initial URL)
    """

    def __init__(self, requestobject, redirectcount):
        super(HTTPRequest, self).__init__()
        self.requestobject = requestobject
        self.redirectcount = redirectcount


def AttachConsoleToDebug(comp):
    """Create and activate a ConsoleEchoer wired to comp's "debug" outbox.

    The echoer is stored on comp as comp.debuggingconsole.
    """
    comp.debuggingconsole = ConsoleEchoer()
    comp.link((comp, "debug"), (comp.debuggingconsole, "inbox"))
    comp.debuggingconsole.activate()


class SingleShotHTTPClient(component):
    """\
    SingleShotHTTPClient() -> component that can download a file using HTTP by URL

    Arguments:

    - starturl           -- the URL of the file to download
    - [postbody]         -- data to POST to that URL
    - [connectionclass]  -- specify a class other than TCPClient to connect with
    """

    Inboxes = {
        "inbox"          : "UNUSED",
        "control"        : "Shutdown signalling (shutdown / shutdownMicroprocess)",
        "_parserinbox"   : "Data from HTTP parser",
        "_parsercontrol" : "Signals from HTTP parser",
        "_tcpcontrol"    : "Signals from TCP client",
    }

    Outboxes = {
        "outbox"         : "Requested file",
        "debug"          : "Output to aid debugging",
        "_parsersignal"  : "Signals for HTTP parser",
        "_tcpoutbox"     : "Send over TCP connection",
        "_tcpsignal"     : "Signals shutdown of TCP connection",
        "signal"         : "producerFinished is sent here when this component ends",
    }

    # A request that has already been redirected this many times is abandoned
    # rather than fetched - we are probably in a redirect loop by then.
    MaxRedirects = 3

    def __init__(self, starturl, postbody = "", connectionclass = TCPClient):
        super(SingleShotHTTPClient, self).__init__()
        self.tcpclient = None
        self.httpparser = None
        self.requestqueue = []
        self.starturl = starturl
        self.connectionclass = connectionclass
        self.postbody = postbody

    def formRequest(self, url):
        """Craft a HTTP request for the supplied url.

        Returns the mapping produced by splitUri(url) with an extra "request"
        entry: a list whose first item is the full request header block and
        whose second item (only when self.postbody is non-empty) is the body
        to POST.
        """
        splituri = splitUri(url)

        host = splituri["uri-server"]
        if "uri-port" in splituri:
            host += ":" + splituri["uri-port"]

        headerlines = []
        if self.postbody == "":
            headerlines.append("GET " + splituri["raw-uri"] + " HTTP/1.1\r\n")
        else:
            headerlines.append("POST " + splituri["raw-uri"] + " HTTP/1.1\r\n")
            headerlines.append("Content-Length: " + str(len(self.postbody)) + "\r\n")

        headerlines.append("Host: " + host + "\r\n")
        headerlines.append("User-agent: Kamaelia HTTP Client 0.3 (RJL)\r\n")
        # keep-alive is a work around for lack of shutdown notification in TCPClient
        headerlines.append("Connection: Keep-Alive\r\n")
        headerlines.append("\r\n")

        # might improve performance by sending more together
        splituri["request"] = ["".join(headerlines)]

        if self.postbody != "":
            splituri["request"].append(self.postbody)

        return splituri

    def makeRequest(self, request):
        """Connect to the remote HTTP server and send request.

        Creates a fresh connection (self.connectionclass) and HTTPParser,
        wires them to this component's private boxes, activates them and
        sends every part of the request out over the connection.
        """
        self.tcpclient = None
        self.httpparser = None

        port = intval(request.requestobject.get("uri-port", ""))
        if port is None:
            port = 80  # no (usable) port given in the URL

        self.tcpclient = self.connectionclass(request.requestobject["uri-server"], port)
        self.httpparser = HTTPParser(mode="response")

        self.link( (self, "_tcpoutbox"), (self.tcpclient, "inbox") )
        self.link( (self, "_tcpsignal"), (self.tcpclient, "control") )
        self.link( (self.tcpclient, "signal"), (self, "_tcpcontrol") )

        # incoming TCP data -> HTTPParser directly
        self.link( (self.tcpclient, "outbox"), (self.httpparser, "inbox") )

        self.link( (self, "_parsersignal"), (self.httpparser, "control") )
        self.link( (self.httpparser, "outbox"), (self, "_parserinbox") )
        self.link( (self.httpparser, "signal"), (self, "_parsercontrol") )

        self.addChildren( self.tcpclient, self.httpparser )
        self.tcpclient.activate()
        self.httpparser.activate()

        self.response = ""

        requestparts = request.requestobject["request"]
        if isinstance(requestparts, str):
            self.send(requestparts, "_tcpoutbox")
        else:
            for part in requestparts:
                self.send(part, "_tcpoutbox")

    def shutdownKids(self):
        """Close TCP connection and HTTP parser (no-op if neither exists)."""
        if self.tcpclient is not None and self.httpparser is not None:
            self.send(producerFinished(), "_tcpsignal")
            self.send(shutdown(), "_parsersignal")
            self.removeChild(self.tcpclient)
            self.removeChild(self.httpparser)
            self.tcpclient = None
            self.httpparser = None

    def handleRedirect(self, header):
        """Check for a redirect response and queue the fetching of the page it
        points to if it is such a response.

        Redirect responses are those with code 301, 302, 303, 307 or 308 that
        carry a non-empty "location" header.

        Returns True if it was a redirect page and False otherwise.
        """
        if header["responsecode"] not in ("301", "302", "303", "307", "308"):
            return False

        # location header gives the redirect URL
        newurl = header["headers"].get("location", "")
        if newurl == "":
            # redirect without a target: handled by the caller like any
            # other non-200 response
            return False

        redirectedrequest = HTTPRequest(self.formRequest(newurl),
                                        self.currentrequest.redirectcount + 1)
        self.requestqueue.append(redirectedrequest)
        return True

    def main(self):
        """Main loop."""
        self.requestqueue.append(HTTPRequest(self.formRequest(self.starturl), 0))

        while self.mainBody():
            yield 1

        self.send(producerFinished(self), "signal")
        yield 1
        return

    def mainBody(self):
        """Called repeatedly by main loop. Checks inboxes and processes messages received.
        Start the fetching of the new page if the current one is a redirect and has been
        completely fetched.

        Returns a true value while there is more to do and a false value when
        main() should finish.
        """
        self.send("SingleShotHTTPClient.mainBody()", "debug")

        while self.dataReady("_parserinbox"):
            msg = self.recv("_parserinbox")

            if isinstance(msg, ParsedHTTPHeader):
                self.send("SingleShotHTTPClient received a ParsedHTTPHeader on _parserinbox", "debug")
                # Only a non-redirect 200 header is passed on; a redirect is
                # queued by handleRedirect and anything else is treated as
                # not found (the header is dropped).
                if not self.handleRedirect(msg.header):
                    if msg.header["responsecode"] == "200":
                        self.send(msg, "outbox")

            elif isinstance(msg, ParsedHTTPBodyChunk):
                self.send("SingleShotHTTPClient received a ParsedHTTPBodyChunk on _parserinbox", "debug")
                if not self.requestqueue:  # if not redirecting then send the response on
                    self.send(msg, "outbox")

            elif isinstance(msg, ParsedHTTPEnd):
                self.send("SingleShotHTTPClient received a ParsedHTTPEnd on _parserinbox", "debug")
                if not self.requestqueue:  # if not redirecting then send the response on
                    self.send(msg, "outbox")
                # this response is complete: drop the connection so that the
                # next call either starts the queued redirect or finishes
                self.shutdownKids()
                return 1

        while self.dataReady("_parsercontrol"):
            self.recv("_parsercontrol")  # message content is not used
            self.send("SingleShotHTTPClient received something on _parsercontrol", "debug")

        while self.dataReady("_tcpcontrol"):
            # pass signals from the TCP client on to the parser
            msg = self.recv("_tcpcontrol")
            self.send(msg, "_parsersignal")

        while self.dataReady("control"):
            msg = self.recv("control")
            if isinstance(msg, (shutdownMicroprocess, shutdown)):
                self.shutdownKids()
                return 0

        # if we're not currently downloading a page
        if self.tcpclient is None:
            # then either we've finished or we should download the next URL
            # (if we've been redirected)
            if not self.requestqueue:
                return 0

            self.currentrequest = self.requestqueue.pop(0)
            if self.currentrequest.redirectcount >= self.MaxRedirects:
                # that many redirects is excessive, give up, we're probably
                # in a loop anyway
                return 0
            self.makeRequest(self.currentrequest)

        self.pause()
        return 1


def makeSSHTTPClient(paramdict):
    """Creates a SingleShotHTTPClient for the given URL. Needed for Carousel.

    paramdict supplies "url" and, optionally, "postbody"; both default to "".
    """
    return SingleShotHTTPClient(paramdict.get("url",""), paramdict.get("postbody",""))


class SimpleHTTPClient(component):
    """\
    SimpleHTTPClient() -> component that downloads each URL sent to its "inbox"

    Each message on "inbox" is either a URL string or a dictionary with a
    "url" entry and an optional "postbody" entry. For every such message the
    body chunks received for it are joined and sent as one string on "outbox".
    """

    Inboxes = {
        "inbox"           : "URLs to download",
        "control"         : "Shut me down",
        "_carouselready"  : "Receive NEXT when carousel has completed a request",
        "_carouselinbox"  : "Data from SingleShotHTTPClient via Carousel",
    }

    Outboxes = {
        "outbox"          : "Requested file's data string",
        "signal"          : "Signal I have shutdown",
        "_carouselnext"   : "Create a new SingleShotHTTPClient",
        "_carouselsignal" : "Shutdown the carousel",
        "debug"           : "Information to aid debugging",
    }

    def __init__(self):
        """Create and link to a carousel object"""
        super(SimpleHTTPClient, self).__init__()
        #AttachConsoleToDebug(self)
        self.debug("SimpleHTTPClient.__init__()")

        self.carousel = Carousel(componentFactory=makeSSHTTPClient)
        self.addChildren(self.carousel)
        self.link((self, "_carouselnext"), (self.carousel, "next"))
        self.link((self, "_carouselsignal"), (self.carousel, "control"))
        self.link((self.carousel, "outbox"), (self, "_carouselinbox"))
        self.link((self.carousel, "requestNext"), (self, "_carouselready"))
        self.carousel.activate()

    def cleanup(self):
        """Destroy child components and send producerFinished when we quit."""
        self.debug("SimpleHTTPClient.cleanup()")
        # shutdown() not currently supported by Carousel
        self.send(producerFinished(self), "_carouselsignal")
        self.send(producerFinished(self), "signal")
        self.removeChild(self.carousel)
        self.unpause()

    def debug(self, msg):
        """Send msg to the "debug" outbox."""
        self.send(msg, "debug")

    def main(self):
        """Main loop.

        Handles one request from "inbox" at a time: hands it to the carousel,
        collects the body chunks that come back until the carousel reports
        (on "_carouselready") that it is ready for the next request, then
        sends the joined body on "outbox".

        A producerFinished on "control" ends the loop once the current pass
        is complete; shutdown or shutdownMicroprocess ends it immediately.
        """
        self.debug("SimpleHTTPClient.main()\n")
        finished = False

        while not finished:
            yield 1
            self.debug("SimpleHTTPClient.main1")

            while self.dataReady("inbox"):
                paramdict = self.recv("inbox")
                if isinstance(paramdict, str):
                    # a bare URL string is shorthand for {"url": <string>}
                    paramdict = { "url": paramdict }

                self.debug("SimpleHTTPClient received url " + paramdict.get("url","") + "\n")
                self.send(paramdict, "_carouselnext")

                # gather chunks in a list and join once at the end instead of
                # repeatedly concatenating a growing string
                bodychunks = []
                carouselbusy = True

                while carouselbusy:
                    yield 1

                    while self.dataReady("_carouselinbox"):
                        msg = self.recv("_carouselinbox")
                        if isinstance(msg, ParsedHTTPBodyChunk):
                            bodychunks.append(msg.bodychunk)

                    while self.dataReady("control"):
                        msg = self.recv("control")
                        if isinstance(msg, producerFinished):
                            finished = True
                        elif isinstance(msg, (shutdown, shutdownMicroprocess)):
                            self.cleanup()
                            return

                    while self.dataReady("_carouselready"):
                        self.recv("_carouselready")  # content is not used
                        carouselbusy = False

                    self.pause()

                self.send("".join(bodychunks), "outbox")

            while self.dataReady("control"):
                msg = self.recv("control")
                if isinstance(msg, producerFinished):
                    finished = True
                elif isinstance(msg, (shutdown, shutdownMicroprocess)):
                    self.cleanup()
                    return

            self.pause()

        self.debug("eoml in SimpleHTTPClient")
        self.cleanup()
        yield 1
        return


if __name__ == '__main__':
    from Kamaelia.Util.PipelineComponent import pipeline
    from Kamaelia.Util.Console import ConsoleReader, ConsoleEchoer
    from Kamaelia.File.Writing import SimpleFileWriter

    # type in a URL e.g. http://www.google.co.uk and have that page saved to disk
    pipeline(
        ConsoleReader(">>> ", ""),
        SimpleHTTPClient(),
        SimpleFileWriter("downloadedfile.txt"),
    ).run()