
Crawling Taobao model information and automatically saving the images on Ubuntu 16.04

Published: 2016-07-27 12:17:04  Source: linux website  Author: INnoVation-V2
Environment: Ubuntu 16.04
Tools: Python 3.5+, Scrapy 1.1, PyCharm
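If Scrapy and BeautifulSoup are not installed yet, they can usually be set up with pip3 install scrapy beautifulsoup4 lxml (assuming a standard pip3 setup); lxml is used below as the parser backend for BeautifulSoup.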
 
The code is as follows:
import scrapy, re, os, lxml, urllib.request
from scrapy.http import Request
from bs4 import BeautifulSoup


class TaobaoMMSpider(scrapy.Spider):
    name = 'TaobaoMM'
    start_urls = ['https://mm.taobao.com/json/request_top_list.htm?page=1']
    # Set this to the directory where the images should be saved
    mainposition = '/media/liuyu/0009F608000B7B40/TaobaoMM/'

    # Parse the first listing page and read the total number of pages
    def parse(self, response):
        content = BeautifulSoup(response.text, "lxml")
        totalpage = content.find('input', id="J_Totalpage").get('value')
        url = 'https://mm.taobao.com/json/request_top_list.htm?page='
        # Only the first listing page is crawled here; change range(1) to
        # range(int(totalpage)) to crawl every page
        for i in range(1):
            yield Request(url + str(i + 1), callback=self.everypage)

    # Process each listing page and extract every model's personal page
    def everypage(self, response):
        content = BeautifulSoup(response.text, "lxml")
        modelinfo = content.find_all('div', class_="personal-info")
        for i in modelinfo:
            name = i.find('a', class_="lady-name").string
            seconddir = self.mainposition + name
            os.makedirs(seconddir, exist_ok=True)
            age = i.find('strong').string
            modelurl = 'https:' + i.find('a', class_="lady-name").get('href')
            yield Request(modelurl, callback=self.infocard, meta={'age': age, 'seconddir': seconddir})

    # Process the model card page, read the model id and build the JSON URL for the profile info
    def infocard(self, response):
        content = BeautifulSoup(response.text, "lxml")
        modelid = content.find('input', id="J_MmuserId").get('value')
        infourl = 'https://mm.taobao.com/self/info/model_info_show.htm?user_id=' + modelid
        albumurl = 'https:' + content.find('ul', class_="mm-p-menu").find('a').get('href')
        yield Request(infourl, callback=self.infoprocess,
                      meta={'seconddir': response.meta['seconddir'], 'albumurl': albumurl, 'age': response.meta['age']})

    # Parse the profile info page, write name, age and measurements to a text file,
    # then move on to the album page
    def infoprocess(self, response):
        seconddir = response.meta['seconddir']
        albumurl = response.meta['albumurl']
        age = response.meta['age']
        content = BeautifulSoup(response.text, "lxml")
        modelinfo = content.find('ul', class_="mm-p-info-cell clearfix")
        info = modelinfo.find_all('li')
        name = info[0].find('span').string
        with open(seconddir + '/' + name + '.txt', 'w') as file:
            file.write('age:' + age + '\n')
            for i in range(6):
                file.write(info[i].find('span').string.replace("\xa0", "") + '\n')
            for i in range(2):
                file.write(info[i + 7].find('p').string + '\n')
            file.write('BWH:' + info[9].find('p').string + '\n')
            file.write('cup_size:' + info[10].find('p').string + '\n')
            file.write('shoe_size:' + info[11].find('p').string + '\n')
        yield Request(albumurl, callback=self.album, meta={'seconddir': response.meta['seconddir']})

    # Process the album frame page, read the user id and build the album-list JSON request URL
    def album(self, response):
        content = BeautifulSoup(response.text, "lxml")
        modelid = content.find('input', id="J_userID").get('value')
        url = 'https://mm.taobao.com/self/album/open_album_list.htm?_charset=utf-8&user_id%20=' + modelid
        yield Request(url, callback=self.allimage, meta={'url': url, 'seconddir': response.meta['seconddir']})

    # Parse the album-list page and read the total number of album pages
    def allimage(self, response):
        url = response.meta['url']
        content = BeautifulSoup(response.text, "lxml")
        page = content.find('input').get('value')
        for i in range(int(page)):
            yield Request(url + '&page=' + str(i + 1), callback=self.image, meta={'seconddir': response.meta['seconddir']})

    # Process each album-list page: read the album name and visit each album
    def image(self, response):
        seconddir = response.meta['seconddir']
        content = BeautifulSoup(response.text, "lxml")
        albuminfo = content.find_all('div', class_="mm-photo-cell-middle")
        for i in albuminfo:
            albumname = i.find('h4').a.string.replace(" ", "")
            albumname = albumname.replace("\n", "")
            thirddir = seconddir + '/' + albumname
            os.makedirs(thirddir, exist_ok=True)
            url = i.find('h4').a.get('href')
            pattern = re.compile('.*?user_id=(.*?)&album_id=(.*?)&album_flag')
            items = re.findall(pattern, url)
            for item in items:
                modelid = item[0]
                albumid = item[1]
                imageurl = 'https://mm.taobao.com/album/json/get_album_photo_list.htm?user_id=' + modelid + '&album_id=' + albumid + '&page='
                yield Request(imageurl, callback=self.imageprocess, meta={'url': imageurl, 'thirddir': thirddir})

    # Parse the album JSON and read the total number of photo pages
    def imageprocess(self, response):
        url = response.meta['url']
        content = response.text
        pattern = re.compile('.*?"totalPage":"(.*?)"')
        item = re.findall(pattern, content)
        pagenum = item[0]
        for i in range(int(pagenum)):
            imageurl = url + str(i + 1)
            yield Request(imageurl, callback=self.saveimage, meta={'thirddir': response.meta['thirddir']})

    # Parse each photo page, extract every picture URL and download the image
    def saveimage(self, response):
        thirddir = response.meta['thirddir']
        content = response.text
        pattern = re.compile('.*?"picUrl":"(.*?)"')
        pattern_2 = re.compile('.*?imgextra/.*?/(.*?)/')
        imageurls = re.findall(pattern, content)
        for imageurl in imageurls:
            imagename_temp = re.findall(pattern_2, imageurl)
            imagename = imagename_temp[0]
            url = 'https:' + imageurl
            print(url)
            u = urllib.request.urlopen(url).read()
            with open(thirddir + '/' + imagename + '.jpg', 'wb') as file:
                file.write(u)
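Note that the class above is only the spider definition; it still has to be launched. A minimal sketch for running it without creating a full Scrapy project is shown below (the file name taobaomm_spider.py is only an assumption; the snippet assumes it is appended to the same file as the spider class):

# Minimal launcher (a sketch, not part of the original article):
# appended to the same file as TaobaoMMSpider and run with python3.
from scrapy.crawler import CrawlerProcess

if __name__ == '__main__':
    process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})  # keep the log output readable
    process.crawl(TaobaoMMSpider)  # schedule the spider defined above
    process.start()                # start crawling; blocks until the crawl finishes

Alternatively, the standard command scrapy runspider taobaomm_spider.py should also work, assuming that file name.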
 
Running results: (screenshots from the original article omitted)
 
Permanent link to this article: http://www.linuxdiyf.com/linux/22748.html