商標買賣,信用百度公司商標信息爬取
商標買賣,信用百度公司商標信息爬取
信用百度公司商標信息和圖片爬取。IP 代理和動態 header 還沒做,這只是個測試小腳本,可以在這個基礎上繼續修改,稍作改動後再選擇自己的存儲方式直接存儲就好。
希望對大家有幫助。直接複製粘貼即可使用。
import re
import time
import requests
from lxml import etree
import json
import execjs
import uuid,oss2
from requests.packages.urllib3.exceptions import InsecureRequestWarning
# Disable the insecure-request warning.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
# Fetch the list of results returned by searching for a company.
def get_company_list():
    """Search xin.baidu.com for the configured company and crawl every hit.

    The search-result page is parsed with lxml; every link carrying the
    ``zx-list-item-url`` class inside the ``zx-list-wrap`` container is
    resolved against ``https://xin.baidu.com`` and handed to
    ``get_markinfo`` together with the ``company_name`` list.

    Returns:
        None. All results are reported by the functions called downstream.
    """
    company_name = ['徐州靈匠信息科技有限公司']
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3534.4 Safari/537.36"
    }
    resp = requests.get(
        url="https://xin.baidu.com/s",
        # Let requests build the query string so reserved characters in the
        # company name (&, #, spaces, ...) are percent-encoded correctly.
        params={"q": company_name[0], "t": 0},
        headers=headers,
        verify=False,
        # requests never times out by default; bound the wait explicitly.
        timeout=10,
    )
    response = resp.content.decode()
    html = etree.HTML(response)
    urls = html.xpath('//*[@class="zx-list-wrap"]/div//a[@class="zx-list-item-url"]/@href')
    for url in urls:
        get_markinfo(company_name, "https://xin.baidu.com" + url)


# Collect the pid and tk parameters that make up the request.
def get_markinfo(company_name, url):
    """Extract the ``pid`` and ``tk`` request parameters from a company page.

    The company detail page embeds a ``pid`` value, a raw token stored in an
    element attribute, the text of the ``#baiducode`` element and a
    JavaScript ``mix`` function. The final ``tk`` is obtained by running that
    ``mix`` function (through execjs) on the raw token and the baiducode
    text. The resulting ``markAjax`` URL is passed to
    ``get_company_mark_info``.

    Args:
        company_name: Label forwarded unchanged to ``get_company_mark_info``.
        url: Absolute URL of the company detail page.

    Raises:
        IndexError: If any of the expected fragments is missing from the page.
        ValueError: If the captured ``pid`` text is not a plain literal.
    """
    # Function-scope import keeps this block self-contained.
    import ast

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3534.4 Safari/537.36"
    }
    # requests never times out by default; bound the wait explicitly.
    resp = requests.get(url, headers=headers, verify=False, timeout=10)
    text = resp.text
    response = resp.content.decode()
    html2 = etree.HTML(response)
    d = html2.xpath('//*[@id="baiducode"]/text()')[0]

    pid_literal = re.findall(r'"pid":(.*?)\,.*?"defTags"', text, re.S)[0]
    # SECURITY: this text comes from a remote page. The original passed it to
    # eval(), which would execute arbitrary code; literal_eval only accepts
    # plain literals (strings, numbers, ...) and raises ValueError otherwise.
    pid = ast.literal_eval(pid_literal)

    _element_id, att = re.findall(
        r"document\.getElementById\('(.*?)'\)\.getAttribute\('(.*?)'\)", text
    )[0]
    tk_func = "function mix(" + re.findall(r'mix\((.*?)\(function', text, re.S)[0]
    # The attribute name is scraped text: escape it before using it in a regex.
    tk = re.findall(re.escape(att) + r'="(.*?)"\>', text)[0]
    tk = execjs.compile(tk_func).call('mix', tk, d)

    time1 = int(time.time() * 1000)
    url1 = "https://xin.baidu.com/detail/markAjax?pid={}&tot={}&_={}".format(pid, tk, time1)
    get_company_mark_info(url1, pid, tk, company_name)


# Send the request that fetches the trademarks.
def _print_mark_records(records):
    """Print every trademark record of one result page.

    This is the place to plug in real storage. The original script read
    these keys from each record: ``markName`` (trademark name),
    ``markRegNo`` (trademark number), ``markStyle`` (trademark image
    address), ``markType`` (trademark type), ``markValidTime`` (validity
    period) and ``markStatus`` (labelled "registration method" by the
    original author). To upload the image to OSS, call
    ``update_img(record['markStyle'])``.
    """
    for record in records:
        print(record)


def get_company_mark_info(url, pid, tk, company_name):
    """Fetch one page of trademark data and, from page 1, all later pages.

    Args:
        url: ``markAjax`` URL of the page to fetch.
        pid: Company id used to build the URLs of the following pages.
        tk: Token used to build the URLs of the following pages.
        company_name: Label used only in the printed progress messages.

    Returns:
        None. Records and progress messages are printed. Any exception is
        caught and reported on stdout so one failing page does not abort the
        whole crawl.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3534.4 Safari/537.36"
    }
    try:
        # requests never times out by default; bound the wait explicitly.
        resp1 = requests.get(url, headers=headers, timeout=10)
        # json.loads already resolves \uXXXX escapes. Pre-decoding with
        # 'unicode_escape' corrupts escaped quotes/newlines and raw UTF-8.
        data = json.loads(resp1.content.decode('utf-8'))
        page_count = data['data']['pageCount']

        if page_count == 0:
            print('-------', company_name, '暫未有注冊的商標數據')
            return

        if page_count == 1:
            print('-------', company_name, '只有一頁數據')
            _print_mark_records(data['data']['list'])
            return

        page = data['data']['page']
        print('-------', company_name, '有多頁數據,當前是第%d頁---' % page)
        _print_mark_records(data['data']['list'])

        if page == page_count:
            print('-------', company_name, '商標數據循環輸出完畢')
        elif page == 1:
            # Only the first page fans out. Letting every non-final page do
            # so (as before) re-requests page 2 from page 2 and never ends.
            for i in range(2, page_count + 1):
                print(i)
                url1 = "https://xin.baidu.com/detail/markAjax?pid={}&tot={}&_={}&p={}".format(
                    pid, tk, int(time.time() * 1000), i
                )
                get_company_mark_info(url1, pid, tk, company_name)
    except Exception as e:
        # Deliberate best-effort boundary: report the failure and carry on.
        print('get_company_mark_info方法出現錯誤:', e)


# Upload a trademark image to OSS (OSS is used to store the images).
def update_img(url):
    """Download the image at *url* and store it in the ``juhe-app`` OSS bucket.

    The object is written under ``zb_news/<uuid1>.jpg``.

    Args:
        url: Address of the image to download.

    Returns:
        The public URL of the uploaded object when the upload answers with
        status 200, otherwise ``None`` (including when the download or the
        upload raises).
    """
    # Placeholders: replace with real credentials, ideally loaded from the
    # environment instead of being hard-coded here.
    account = '賬號'
    key = '密碼'
    uid = uuid.uuid1()
    auth = oss2.Auth(account, key)
    bucket = oss2.Bucket(auth, 'http://oss-cn-hangzhou.aliyuncs.com', 'juhe-app')
    path = 'zb_news/%s.jpg' % uid  # storage path inside the bucket
    try:
        # requests never times out by default; bound the wait explicitly.
        image_resp = requests.get(url, timeout=10)
        # Do not upload an HTTP error page as if it were the image.
        image_resp.raise_for_status()
        result = bucket.put_object(path, image_resp)
    except Exception:
        # Best-effort upload. Unlike the previous bare ``except:``, this
        # lets KeyboardInterrupt and SystemExit propagate.
        return None
    if result.status == 200:
        return 'http://juhe-app.oss-cn-hangzhou.aliyuncs.com/' + path
    return None


# Start the program.
def run():
    """Start the crawl by fetching the company search results."""
    get_company_list()


if __name__ == '__main__':
    run()