闲得无事,看到知乎上列举的种种 Python 优点,按捺不住,装上了 Python 3.4。
在网上找了一段抓取图片的代码,修改后兼容 Python 3.4,成功抓取了指定 URL 页面中的所有 JPG 图片,代码如下:
import os
import re
import urllib
import urllib.request

# Image crawler: fetches one page and saves every .jpg image it references.

# Directory containing this script; downloaded images go in a sub-folder of it.
download_path = os.path.dirname(os.path.abspath(__file__))


class spider(object):
    """Download every ``.jpg`` image referenced by a single web page.

    Attributes:
        url: Address of the page that is scanned for image links.
    """

    # Matches src="<absolute http(s) URL ending in .jpg>", tolerating
    # whitespace before the closing quote.  The URL part is non-greedy and
    # may not contain a double quote or newline, so each attribute value is
    # captured on its own even when several tags share one line (a greedy
    # ".*" would swallow everything up to the last ".jpg" on the line).
    _IMG_SRC_RE = re.compile(r'src="(https?://[^"\n]*?\.jpg)\s*"')

    def __init__(self, url):
        """Remember the page URL to crawl.

        Args:
            url: Address of the page to scan for images.
        """
        self.url = url

    def parse(self, content):
        """Extract the absolute ``.jpg`` URLs found in ``src`` attributes.

        Args:
            content: Page markup as text.

        Returns:
            A list of image URLs in document order (duplicates are kept).
        """
        return self._IMG_SRC_RE.findall(content)

    def downloads(self, urls):
        """Save each image into the ``test`` folder under ``download_path``.

        The folder is created when missing.  Every file is named after the
        last path segment of its URL, so a later URL with the same file name
        overwrites an earlier one.

        Args:
            urls: Iterable of image URLs to fetch.

        Raises:
            urllib.error.URLError: If a download fails; remaining URLs are
                not attempted.
        """
        d_path = os.path.join(download_path, "test")
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(d_path, exist_ok=True)
        for url in urls:
            filename = url.split("/")[-1]
            print(url)
            print("Downloads %s" % (filename))
            output = os.path.join(d_path, filename)
            urllib.request.urlretrieve(url, output)

    def run(self):
        """Fetch the page, collect its image URLs and download them all.

        Raises:
            urllib.error.URLError: If the page or an image cannot be fetched.
        """
        # The with-block closes the connection even if reading fails.
        with urllib.request.urlopen(self.url) as response:
            raw = response.read()
            # Honour the charset declared in the Content-Type header; many
            # pages are not UTF-8.  Fall back to UTF-8 when none is declared.
            charset = response.headers.get_content_charset() or "UTF-8"
        try:
            content = raw.decode(charset, errors="replace")
        except LookupError:
            # The server declared a charset name Python does not know.
            content = raw.decode("UTF-8", errors="replace")
        self.downloads(self.parse(content))


if __name__ == "__main__":
    sp = spider("http://news.cnfol.com/img/20150814/17638.shtml")
    sp.run()