from pyquery import PyQuery as pq
import urllib.request
import pymysql
import uuid
# Open the MySQL connection and cursor that the rest of the script shares.
# NOTE(review): host and credentials are hard-coded; consider loading them
# from configuration or environment variables before sharing this script.
conn = pymysql.connect(
    host='127.0.0.1',
    user="root",
    passwd="123456",
    db="test",
    port=3306,
    charset="utf8",
)
cur = conn.cursor()

# Fetch every row of the `user` table and print each one.
cur.execute("select * from user")
users = cur.fetchall()
for user in users:
    print(user)
# Fetch the raw HTML source of one listing page.
def get_content(page, base_url=None):
    """Download one listing page and return its HTML as text.

    Args:
        page: Page number; it is appended to the URL prefix via ``str(page)``.
        base_url: URL prefix that ends right before the page number. When
            omitted, the souq.com mobile-phone-accessories listing is used.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the page cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    if base_url is None:
        # The pasted URL contained "§ion=2": the "&sect" prefix of "&section"
        # had been rendered as the HTML entity for "§". Restored here.
        base_url = (
            'https://saudi.souq.com/sa-en/mobile-phone-accessories/l/'
            '?rpp=32&_=1550499488459&sortby=sr&section=2&page='
        )
    url = base_url + str(page)
    # The context manager closes the response even if read/decode fails.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')
def get(html):
    """Select the product links contained in a listing page.

    Args:
        html: HTML source of a listing page, as returned by ``get_content``.

    Returns:
        The PyQuery selection of every element that carries all three classes
        ``img-link``, ``quickViewAction`` and ``sPrimaryLink``.
    """
    doc = pq(html)
    return doc('.img-link.quickViewAction.sPrimaryLink')
# Crawl the listing pages one by one, scrape every linked product page into
# the `shop` table, and append each visited product link to job.txt.

# The statement never changes, so it is built once outside the loops.
sql = (
    "insert into shop (product_id, product_name,product_link,product_seller,"
    "product_price,product_sales,product_stock,product_image) "
    "values (%s, %s, %s, %s,%s, %s, %s, %s)"
)

try:
    for j in range(1, 3000):
        print("正在爬取第" + str(j) + "页数据…")
        html = get_content(j)  # raw HTML source of listing page j
        for i in get(html):
            prodouct_link = pq(i).attr('href')
            if not prodouct_link:
                # An anchor without an href cannot be fetched, and
                # concatenating None below would abort the whole crawl.
                continue

            # Best effort: a product page that fails to load or parse must
            # not stop the crawl, but the failure is reported instead of
            # being silently swallowed.
            try:
                doc = pq(url=prodouct_link)
                title = doc('.product-title>h1').text()
                price = doc('.price.is.sk-clr1').text()
                stock = doc('.txtcolor-alert.xleft>span').text()
                color = doc('span.connection.title').text()  # scraped but not stored
                shop_name = doc('.unit-seller-link>a>b').text()
                sales = doc('.show-for-medium.bold-text').text()
                image = doc('.img-bucket>img').attr("src")
                prodouct_id = str(uuid.uuid1())

                try:
                    count = cur.execute(
                        sql,
                        [prodouct_id, title, prodouct_link, shop_name,
                         price, sales, stock, image],
                    )
                    # Commit only when the insert actually affected a row.
                    if count > 0:
                        print("添加数据成功!\n")
                        conn.commit()
                except pymysql.MySQLError as exc:
                    # Discard the failed statement so that it cannot leak
                    # into the next product's transaction.
                    conn.rollback()
                    print("insert failed for " + prodouct_link + ": " + repr(exc))
            except Exception as exc:
                print("skipping " + prodouct_link + ": " + repr(exc))

            # Record the link whether or not scraping succeeded; the `with`
            # block closes the file, so no explicit close() is needed.
            with open('job.txt', 'a', encoding='utf-8') as f:
                f.write(prodouct_link + '\n')
finally:
    # Release the database resources even when the crawl is interrupted.
    cur.close()
    conn.close()