1 利用urllib.request模块请求网页并返回网页文本;
2 从网页文本中利用正则表达式筛选出img src地址(返回一个全部src的列表);
3 根据列表中的src地址逐一下载图片文件并保存到本地;
代码:
运行效果:
附代码1:
import os
import re
import urllib.parse
import urllib.request

# Script 1: download every .jpg image referenced by one web page.

# 1. Fetch the page. The context manager closes the connection even if
#    read() raises.
# url = 'http://www.kgc.cn/list'
url = 'http://www.ttpaihang.com/vote/rank.php?voteid=1410&page=2'
with urllib.request.urlopen(url) as response:
    buf = response.read()

# 2. Collect the image addresses.
# buf = buf.decode('UTF-8')
buf = buf.decode('gb2312')
# The capture group returns only the address that follows `img src="`,
# so no prefix has to be stripped afterwards. urljoin leaves absolute
# addresses untouched and resolves relative ones against the page URL
# (plain string concatenation produced broken addresses for paths that
# do not start with "/").
listurl = [
    urllib.parse.urljoin(url, src)
    for src in re.findall(r'img src="([^"]+\.jpg)', buf)
]
for image_url in listurl:
    print(image_url)

# 3. Download the images and save them locally as 0.jpg, 1.jpg, ...
fpath = "D:\\pic2\\"
os.makedirs(fpath, exist_ok=True)
for index, image_url in enumerate(listurl):
    # Download first so that a failed request does not leave an empty file.
    with urllib.request.urlopen(image_url) as response:
        data = response.read()
    with open(os.path.join(fpath, str(index) + '.jpg'), 'wb') as image_file:
        image_file.write(data)
........
附代码2(写成函数的形式)
import os              # file and directory operations
import re              # regular expressions
import socket          # socket operations
import urllib.parse    # resolving relative image addresses
import urllib.request  # requesting resources from the server
# socket.setdefaulttimeout(20)  # set the socket-level timeout to 20 seconds
def gethtml(url):  # 1. fetch the raw content of a web page
    """Return the undecoded response body of *url*.

    Args:
        url: Address to open; passed unchanged to ``urllib.request.urlopen``.

    Returns:
        The complete response body as ``bytes``. Decoding is left to the
        caller because the page encoding is not known here.

    Raises:
        urllib.error.URLError: If the request fails.
    """
    # The context manager closes the connection even when read() raises.
    with urllib.request.urlopen(url) as req:
        return req.read()
def getImg(buf, codec, fpath, base_url=None):  # 2. extract image addresses from the HTML, then download them
    """Download every ``.jpg`` image referenced by an HTML page.

    Args:
        buf: Raw page content as ``bytes``.
        codec: Name of the encoding used to decode ``buf``
            (for example ``'gb2312'`` or ``'UTF-8'``).
        fpath: Folder the images are written to; it is created when
            missing and may be given with or without a trailing separator.
        base_url: Address of the page ``buf`` was read from, used to
            resolve relative image addresses. When omitted, the
            module-level ``url`` is used, which is the value the function
            previously read implicitly.

    Returns:
        None. The images are saved as ``0.jpg``, ``1.jpg``, ... in ``fpath``.

    Raises:
        UnicodeDecodeError: If ``buf`` is not valid in ``codec``.
        urllib.error.URLError: If an image request fails.
        OSError: If the folder or a file cannot be written.
    """
    if base_url is None:
        base_url = url  # fall back to the page address defined at module level

    html = buf.decode(codec)

    # `[^"]+` keeps the match inside one attribute value. The earlier
    # `.+?` could start at a non-jpg <img> and run across later quotes
    # until it met an unrelated `.jpg"`.
    reg = r'img src="([^"]+\.jpg)"'
    # urljoin leaves absolute addresses untouched and resolves relative
    # ones (with or without a leading "/") against the page address.
    listurl = [urllib.parse.urljoin(base_url, src) for src in re.findall(reg, html)]
    print("准备下载图片数量:", len(listurl))
    for imgurl in listurl:
        print(imgurl)

    # 3. Download the images and save them locally.
    os.makedirs(fpath, exist_ok=True)
    for i, imgurl in enumerate(listurl):
        # Fetch first so that a failed request does not leave an empty file.
        with urllib.request.urlopen(imgurl) as req:
            data = req.read()
        with open(os.path.join(fpath, str(i) + '.jpg'), 'wb') as f:
            f.write(data)
# Four settings to check before running:
#   1. the page URL;
#   2. the page encoding (UTF-8 or gb2312);
#   3. the image extension, jpg or png (it appears in two places);
#   4. the output folder.
# url = 'http://www.kgc.cn/list'
url = 'http://www.ttpaihang.com/vote/rank.php?voteid=1410&page=3'
buf = gethtml(url)
# codec = 'UTF-8'
codec = 'gb2312'
fpath = "D:\\pic4\\"
# getImg reports its progress itself and returns nothing, so its result is
# not printed (doing so only ever printed "None").
getImg(buf, codec, fpath)
-End-