
Crawling Taobao model information and automatically saving the images on Ubuntu 16.04

Published: 2016-07-27 12:17:04  Source: linux website  Author: INnoVation-V2
Environment: Ubuntu 16.04
Tools: Python 3.5+, Scrapy 1.1, PyCharm
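If Scrapy and BeautifulSoup are not installed yet, they can usually be set up with pip3 install scrapy beautifulsoup4 lxml (assuming a standard pip3 setup); lxml is used below as the parser backend for BeautifulSoup.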
 
The code is as follows:
import scrapy, re, os, lxml, urllib.request
from scrapy.http import Request
from bs4 import BeautifulSoup


class TaobaoMMSpider(scrapy.Spider):
    name = 'TaobaoMM'
    start_urls = ['https://mm.taobao.com/json/request_top_list.htm?page=1']
    # Set this to the directory where the images should be saved
    mainposition = '/media/liuyu/0009F608000B7B40/TaobaoMM/'

    # Parse the first listing page and read the total number of pages
    def parse(self, response):
        content = BeautifulSoup(response.text, "lxml")
        totalpage = content.find('input', id="J_Totalpage").get('value')
        url = 'https://mm.taobao.com/json/request_top_list.htm?page='
        # Only the first listing page is crawled here; change range(1) to
        # range(int(totalpage)) to crawl every page
        for i in range(1):
            yield Request(url + str(i + 1), callback=self.everypage)

    # Process each listing page and extract every model's personal page
    def everypage(self, response):
        content = BeautifulSoup(response.text, "lxml")
        modelinfo = content.find_all('div', class_="personal-info")
        for i in modelinfo:
            name = i.find('a', class_="lady-name").string
            seconddir = self.mainposition + name
            os.makedirs(seconddir, exist_ok=True)
            age = i.find('strong').string
            modelurl = 'https:' + i.find('a', class_="lady-name").get('href')
            yield Request(modelurl, callback=self.infocard, meta={'age': age, 'seconddir': seconddir})

    # Process the model card page, read the model id and build the JSON URL for the profile info
    def infocard(self, response):
        content = BeautifulSoup(response.text, "lxml")
        modelid = content.find('input', id="J_MmuserId").get('value')
        infourl = 'https://mm.taobao.com/self/info/model_info_show.htm?user_id=' + modelid
        albumurl = 'https:' + content.find('ul', class_="mm-p-menu").find('a').get('href')
        yield Request(infourl, callback=self.infoprocess,
                      meta={'seconddir': response.meta['seconddir'], 'albumurl': albumurl, 'age': response.meta['age']})

    # Parse the profile info page, write name, age and measurements to a text file,
    # then move on to the album page
    def infoprocess(self, response):
        seconddir = response.meta['seconddir']
        albumurl = response.meta['albumurl']
        age = response.meta['age']
        content = BeautifulSoup(response.text, "lxml")
        modelinfo = content.find('ul', class_="mm-p-info-cell clearfix")
        info = modelinfo.find_all('li')
        name = info[0].find('span').string
        with open(seconddir + '/' + name + '.txt', 'w') as file:
            file.write('age:' + age + '\n')
            for i in range(6):
                file.write(info[i].find('span').string.replace("\xa0", "") + '\n')
            for i in range(2):
                file.write(info[i + 7].find('p').string + '\n')
            file.write('BWH:' + info[9].find('p').string + '\n')
            file.write('cup_size:' + info[10].find('p').string + '\n')
            file.write('shoe_size:' + info[11].find('p').string + '\n')
        yield Request(albumurl, callback=self.album, meta={'seconddir': response.meta['seconddir']})

    # Process the album frame page, read the user id and build the album-list JSON request URL
    def album(self, response):
        content = BeautifulSoup(response.text, "lxml")
        modelid = content.find('input', id="J_userID").get('value')
        url = 'https://mm.taobao.com/self/album/open_album_list.htm?_charset=utf-8&user_id%20=' + modelid
        yield Request(url, callback=self.allimage, meta={'url': url, 'seconddir': response.meta['seconddir']})

    # Parse the album-list page and read the total number of album pages
    def allimage(self, response):
        url = response.meta['url']
        content = BeautifulSoup(response.text, "lxml")
        page = content.find('input').get('value')
        for i in range(int(page)):
            yield Request(url + '&page=' + str(i + 1), callback=self.image, meta={'seconddir': response.meta['seconddir']})

    # Process each album-list page: read the album name and visit each album
    def image(self, response):
        seconddir = response.meta['seconddir']
        content = BeautifulSoup(response.text, "lxml")
        albuminfo = content.find_all('div', class_="mm-photo-cell-middle")
        for i in albuminfo:
            albumname = i.find('h4').a.string.replace(" ", "")
            albumname = albumname.replace("\n", "")
            thirddir = seconddir + '/' + albumname
            os.makedirs(thirddir, exist_ok=True)
            url = i.find('h4').a.get('href')
            pattern = re.compile('.*?user_id=(.*?)&album_id=(.*?)&album_flag')
            items = re.findall(pattern, url)
            for item in items:
                modelid = item[0]
                albumid = item[1]
                imageurl = 'https://mm.taobao.com/album/json/get_album_photo_list.htm?user_id=' + modelid + '&album_id=' + albumid + '&page='
                yield Request(imageurl, callback=self.imageprocess, meta={'url': imageurl, 'thirddir': thirddir})

    # Parse the album JSON and read the total number of photo pages
    def imageprocess(self, response):
        url = response.meta['url']
        content = response.text
        pattern = re.compile('.*?"totalPage":"(.*?)"')
        item = re.findall(pattern, content)
        pagenum = item[0]
        for i in range(int(pagenum)):
            imageurl = url + str(i + 1)
            yield Request(imageurl, callback=self.saveimage, meta={'thirddir': response.meta['thirddir']})

    # Parse each photo page, extract every picture URL and download the image
    def saveimage(self, response):
        thirddir = response.meta['thirddir']
        content = response.text
        pattern = re.compile('.*?"picUrl":"(.*?)"')
        pattern_2 = re.compile('.*?imgextra/.*?/(.*?)/')
        imageurls = re.findall(pattern, content)
        for imageurl in imageurls:
            imagename_temp = re.findall(pattern_2, imageurl)
            imagename = imagename_temp[0]
            url = 'https:' + imageurl
            print(url)
            u = urllib.request.urlopen(url).read()
            with open(thirddir + '/' + imagename + '.jpg', 'wb') as file:
                file.write(u)
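Note that the class above is only the spider definition; it still has to be launched. A minimal sketch for running it without creating a full Scrapy project is shown below (the file name taobaomm_spider.py is only an assumption; the snippet assumes it is appended to the same file as the spider class):

# Minimal launcher (a sketch, not part of the original article):
# appended to the same file as TaobaoMMSpider and run with python3.
from scrapy.crawler import CrawlerProcess

if __name__ == '__main__':
    process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})  # keep the log output readable
    process.crawl(TaobaoMMSpider)  # schedule the spider defined above
    process.start()                # start crawling; blocks until the crawl finishes

Alternatively, the standard command scrapy runspider taobaomm_spider.py should also work, assuming that file name.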
 
Running results: (screenshots from the original article omitted)
 
Permanent link to this article: http://www.linuxdiyf.com/linux/22748.html