#A utility script to fetch all of the mp3s linked to by the given
#Web page.
#
#Usage: python fetchMp3s.py <url> <directory>
#where <url> is an absolute URL pointing to a Web page and
#<directory> is the location for the fetched files. Currently,
#if <url> points to a directory, you must ensure that it ends with
#a "/" for this script to function properly.
#
#David Faden
#dfaden@iastate.edu
#Spring 2004
from HTMLParser import HTMLParser
from os.path import basename, expanduser, join
from sets import Set
from sys import argv
from urllib import urlopen, urlretrieve
from urlparse import urljoin
class Mp3Fetcher(HTMLParser):
    """Download every MP3 linked from a Web page into a local directory.

    Constructing an instance does all of the work: the page at ``baseUrl``
    is downloaded and parsed, and each ``<a href="...mp3">`` target is
    saved into ``saveDirectory`` as it is encountered.

    Attributes:
        baseUrl: URL that relative links are resolved against. This is the
            URL the page was actually served from (after any redirects),
            falling back to the URL passed in.
        saveDirectory: destination directory, with a leading "~" expanded.
        fetchedUrls: set of absolute MP3 URLs already requested, used to
            skip links that appear more than once on the page.
    """

    def __init__(self, baseUrl, saveDirectory):
        """Fetch the page at baseUrl and save its linked MP3s.

        baseUrl -- absolute URL of the page to scan.
        saveDirectory -- directory the MP3s are written to; it must
            already exist, since nothing here creates it.

        Errors raised while opening the page or retrieving a file are not
        caught and propagate to the caller.
        """
        HTMLParser.__init__(self)
        self.baseUrl = baseUrl
        self.saveDirectory = expanduser(saveDirectory)
        self.fetchedUrls = set()

        page = urlopen(baseUrl)
        try:
            # Resolve links against the final URL rather than the one typed
            # by the user. Servers commonly redirect "http://host/dir" to
            # "http://host/dir/", so this stops urljoin() from dropping the
            # last path segment in that case. When no redirect happens, a
            # directory URL still has to be given with its trailing slash.
            self.baseUrl = page.geturl() or baseUrl
            html = page.read()
        finally:
            # Release the connection even if reading fails.
            page.close()

        # feed() needs the interpreter's native str type. When read()
        # already returned a str this is a no-op; when it returned a
        # separate bytes type the data is decoded leniently so that stray
        # bytes cannot abort the scan.
        if not isinstance(html, str):
            html = html.decode("utf-8", "replace")
        self.feed(html)

    def handle_starttag(self, tag, attrs):
        """Download the target of an <a> tag if it points to an MP3.

        Called by HTMLParser for every start tag; tags other than <a>,
        links that do not end in ".mp3" (compared case-insensitively) and
        URLs that were already fetched are ignored.
        """
        if tag != "a":
            return

        # An attribute written without a value (<a href>) is reported as
        # None, and a missing attribute yields None from get(); treat both
        # as "no link" instead of calling a string method on None.
        href = dict(attrs).get("href") or ""
        if not href.lower().endswith(".mp3"):
            return

        mp3Url = urljoin(self.baseUrl, href)
        if mp3Url in self.fetchedUrls:
            return
        # Recorded before the download so a repeated link is never retried.
        self.fetchedUrls.add(mp3Url)

        localFilename = join(self.saveDirectory, basename(href))
        urlretrieve(mp3Url, localFilename)
        # Two spaces are intentional: they match what the previous
        # 'print "Fetched ", href' statement wrote.
        print("Fetched  %s" % href)
if __name__ == '__main__':
    # argv[0] is the script name, so exactly two user-supplied arguments
    # (the page URL and the save directory) are required.
    if len(argv) != 3:
        print("Usage: python fetchMp3s.py <url> <directory>")
        # Exit with a non-zero status so callers can detect the misuse.
        raise SystemExit(2)
    # Constructing the fetcher downloads and parses the page, saving each
    # linked MP3 as it goes; the instance itself is not needed afterwards.
    Mp3Fetcher(argv[1], argv[2])