#A utility script to fetch all of the mp3s linked to by the given
#Web page.
#
#Usage: python fetchMp3s.py URL DIRECTORY
#where URL is an absolute URL pointing to a Web page and
#DIRECTORY is the location for the fetched files. DIRECTORY must
#already exist.
#
#Relative links are resolved against the address the page was finally
#served from (i.e. after any redirects). If the server does not redirect
#a directory URL that lacks its trailing "/", you must still supply the
#"/" yourself, otherwise the last directory is dropped when links are
#resolved.
#
#David Faden
#dfaden@iastate.edu
#Spring 2004

from os.path import basename, expanduser, join
from sys import argv

try:  # Python 3 module layout
    from html.parser import HTMLParser
    from urllib.parse import urljoin, urlsplit
    from urllib.request import urlopen, urlretrieve
except ImportError:  # Python 2 module layout
    from HTMLParser import HTMLParser
    from urllib import urlopen, urlretrieve
    from urlparse import urljoin, urlsplit


def _decodePage(rawPage, headers):
    """Return the page body as text suitable for HTMLParser.feed().

    rawPage -- the value returned by the response's read().
    headers -- the response's header object (from info()); may be None.

    On Python 2 read() already returns a str, which is passed through
    untouched. On Python 3 read() returns bytes, which are decoded using
    the charset named in the headers when there is one, else UTF-8.
    Undecodable bytes are replaced rather than raising, since only the
    (ASCII) markup around the links matters here.
    """
    if isinstance(rawPage, str):
        return rawPage
    getCharset = getattr(headers, "get_content_charset", None)
    charset = (getCharset() if getCharset else None) or "utf-8"
    try:
        return rawPage.decode(charset, "replace")
    except LookupError:
        #The server named an encoding Python does not know.
        return rawPage.decode("utf-8", "replace")


def _urlPath(href):
    """Return the path component of href, without query or fragment.

    If href cannot be parsed it is returned unchanged, so that malformed
    links are judged on their raw text exactly as before.
    """
    try:
        return urlsplit(href).path
    except ValueError:
        return href


class Mp3Fetcher(HTMLParser):
    """I fetch all of the MP3s from the given page, storing
    them in the given directory.

    All of the work happens in the constructor: the page at baseUrl is
    downloaded and parsed, and every distinct link whose path ends in
    ".mp3" (in any letter case) is retrieved into saveDirectory.

    Attributes:
    baseUrl -- the page URL exactly as given by the caller.
    saveDirectory -- the target directory, with "~" expanded.
    fetchedUrls -- the set of absolute MP3 URLs already requested.
    """

    def __init__(self, baseUrl, saveDirectory):
        """Download baseUrl and fetch every MP3 it links to.

        baseUrl -- absolute URL of the page to scan.
        saveDirectory -- existing directory that receives the files.

        Errors from opening the page or retrieving a file propagate to
        the caller.
        """
        HTMLParser.__init__(self)
        self.baseUrl = baseUrl
        self.saveDirectory = expanduser(saveDirectory)
        self.fetchedUrls = set()
        response = urlopen(baseUrl)
        try:
            #Resolve links against the address the page was really
            #served from: a server that redirects ".../dir" to
            #".../dir/" then yields correct relative links even when
            #the caller left off the trailing slash.
            self._linkBase = response.geturl() or baseUrl
            page = _decodePage(response.read(), response.info())
        finally:
            response.close()
        self.feed(page)

    def handle_starttag(self, tag, attrs):
        """Fetch the target of an <a> tag if it points at an MP3.

        tag -- the tag name, as passed in by HTMLParser.
        attrs -- list of (name, value) pairs; value is None for an
                 attribute written without a value.
        """
        if tag != "a":
            return
        #"or" also covers a bare <a href>, whose value is None.
        href = dict(attrs).get("href") or ""
        linkPath = _urlPath(href)
        #Test the path only, so "song.mp3?id=3" and "SONG.MP3" qualify.
        if not linkPath.lower().endswith(".mp3"):
            return
        mp3Url = urljoin(self._linkBase, href)
        if mp3Url in self.fetchedUrls:
            return
        self.fetchedUrls.add(mp3Url)
        localFilename = join(self.saveDirectory, basename(linkPath))
        urlretrieve(mp3Url, localFilename)
        #Single-argument form prints the same text on Python 2 and 3.
        print("Fetched  " + href)


def main(args):
    """Run the fetcher from a command-line style argument list.

    args -- [scriptName, url, saveDirectory]; any other length prints
            the usage message instead of fetching.
    """
    if len(args) == 3:
        Mp3Fetcher(args[1], args[2])
    else:
        print("Usage: python fetchMp3s.py URL DIRECTORY")


if __name__ == '__main__':
    main(argv)