#!/usr/bin/python """ Uses the USGS's web services to download all data from a given dataset covering a given area. Requires the Suds and BeautifulSoup python modules and wget. The program needs a product key to select a dataset. Look for product keys at http://ags.cr.usgs.gov/index_service/Index_Service_SOAP.asmx . After the first run for a dataset, the program will store all of the download URLs for the dataset in a file in the current directory. A URL is removed from the file for every successful download. If you want to download data from the same dataset but a different area, you will need to first delete the URL file. """ import re, sys, os.path, subprocess, urllib2, time from optparse import OptionParser from suds.client import Client from BeautifulSoup import BeautifulStoneSoup def download_urls(product, left, bottom, right, top): validation = Client('http://extract.cr.usgs.gov/requestValidationService/wsdl/RequestValidationService.wsdl') raw_response = validation.service.processAOI("\ \ \ \ {0}\ {1}\ {2}\ {3}\ \ \ \ \ {4}\ \ 250\ \ \ \ ".format(top, bottom, left, right, product)) # The XML response isn't proper XML, because ampersands in the URLs aren't # quoted. BeautifulSoup, by default, turns "&foo=bar" into "&foo;=bar", # when we want "&foo=bar", so we need to add some markupMassage. # Really, though, we need the final product to just have "&foo=bar", which # is accomplished by the convertEntities parameter. soup = BeautifulStoneSoup(raw_response, markupMassage=[(re.compile('&'), lambda match: '&')], convertEntities=BeautifulStoneSoup.XML_ENTITIES) if soup.status.text != 'true': print "Error." print soup.prettify() exit(1) urls = [ u.text for u in soup.findAll('download_url') ] urls.sort() return urls def read_urls_from_file(product): urls = [] with open(product, 'r') as f: urls = [ u.strip() for u in f.readlines() ] return urls def write_urls_to_file(product, urls): with open(product, 'w') as f: f.writelines([ u + '\n' for u in urls ]) def download_url(url): print '------------------------------------------------------------' response = urllib2.urlopen(url).read() id = BeautifulStoneSoup(response).find('ns:return').text print id status = -1 while status < 400: if status != -1: time.sleep(30) response = urllib2.urlopen('http://extract.cr.usgs.gov/axis2/services/DownloadService/getDownloadStatus?downloadID=' + id).read() message = BeautifulStoneSoup(response, markupMassage=[(re.compile(' \n'), lambda match: ' ')]).find('ns:return').text print message match = re.search('^(\d+),', message) if match: status = int(match.group(1)) else: status = -2 subprocess.call(['wget', '--trust-server-names', 'http://extract.cr.usgs.gov/axis2/services/DownloadService/getData?downloadID=' + id]) response = urllib2.urlopen('http://extract.cr.usgs.gov/axis2/services/DownloadService/setDownloadComplete?downloadID=' + id).read() print BeautifulStoneSoup(response, markupMassage=[(re.compile(' \n'), lambda match: ' ')]).find('ns:return').text if __name__ == '__main__': parser = OptionParser(usage="Usage: %prog [--bbox=LEFT,BOTTOM,RIGHT,TOP] PRODUCT") parser.add_option('-b', '--bbox', dest="bbox", help="Bounding box for the product query. Only needed if no URLs have been downloaded already.") (options, args) = parser.parse_args() try: product = args[0] except IndexError: print "You must specify a product to download." exit(1) if os.path.exists(product): urls = read_urls_from_file(product) else: if not options.bbox: print "No URLs have been downloaded. You must specify a bounding box." exit(1) (l, b, r, t) = [ float(n) for n in options.bbox.split(',') ] urls = download_urls(product, l, b, r, t) write_urls_to_file(product, urls) try: while True: download_url(urls.pop()) write_urls_to_file(product, urls) except IndexError: pass