# 爬虫实战——天眼查企业信息 (Scraping walkthrough: Tianyancha company info)
# coding = utf-8
import urllib.request
import re
from bs4 import BeautifulSoup
import pandas as pd
import os
import json
import numpy as np
import time
class SpiderSupplier:
    """Scrape one company's details from tianyancha.com.

    Collects registration ("工商"), risk, operating, tax-rating, certificate,
    construction-qualification and judicial ("司法") information into a single
    dict via :meth:`run`.

    Typical use::

        info = SpiderSupplier('some company name').run()
    """

    def __init__(self, company):
        """Resolve the Tianyancha numeric company id for *company*.

        Retries up to 10 times with a 60 s back-off between attempts.
        """
        url = 'https://www.tianyancha.com/search?key={}'.format(company)
        self.url = urllib.request.quote(url, safe=";/?:@&=+$,", encoding='utf-8')
        for i in range(10):
            try:
                # The first search hit's href ends with the numeric company id.
                self.company_id = \
                    self.get_soup(self.url).find_all(
                        "a", class_="index_alink__zcia5 link-click")[0].get('href').split('/')[-1]
                print(company, " company_id 获取" + " 第%d次连接中····" % (i + 1))
                break
            except Exception:  # narrowed from bare except: retry on any failure
                print(company, " company_id 无法获取" + " 第%d次连接失败,正在重连····" % (i + 1))
                time.sleep(60)
                continue
        # NOTE(review): if all 10 attempts fail, self.company_id is never set
        # and run() will raise AttributeError — confirm this is acceptable.
        self.info = {}

    def get_soup(self, url):
        """Fetch *url* with a randomized UA/cookie pair and return a soup.

        Retries the request construction forever with a 5 s delay on failure.
        """
        while True:
            try:
                # np.random.choice allows a rotating pool of cookies / agents;
                # each pool currently holds a single (placeholder) entry.
                cookie = np.random.choice(['COOKIES'])
                usr_agent = np.random.choice([
                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
                ])
                head = {
                    'User-Agent': usr_agent.encode('utf-8'),
                    # latin1 round-trip keeps any non-ASCII cookie bytes header-safe
                    "cookie": cookie.encode('utf-8').decode('latin1')}
                request = urllib.request.Request(url, headers=head)
                break
            except Exception:  # narrowed from bare except
                print("url", " 连接失败,正在重连····")
                time.sleep(5)
                continue
        # (fix: removed `request.encoding = 'utf-8'` — urllib Request has no
        # such attribute semantics; the assignment was a silent no-op)
        with urllib.request.urlopen(request) as response:  # fix: close response
            html = response.read().decode('utf-8')
        return BeautifulSoup(html, 'html.parser')

    def get_else(self, rule):
        """Return *rule* unchanged.

        The original try/except around ``return rule`` was dead code — *rule*
        is evaluated by the caller before this method runs — so this is the
        identity function; kept for interface compatibility.
        """
        return rule

    def split_text_num(self, s):
        """Return the index of the first digit in *s*, or ``len(s)`` if none."""
        for idx, ch in enumerate(s):
            if ch.isdigit():
                return idx
        return len(s)

    def get_business_info(self, soup, info=None):
        """Extract registration fields from the page's __NEXT_DATA__ JSON blob.

        Fix: the original used the mutable default ``info={}``, silently
        sharing one dict across all calls that omitted *info*.

        '...Time' fields (epoch milliseconds) become 'YYYY-MM-DD'; any field
        that is missing or malformed becomes ''.
        """
        if info is None:
            info = {}
        text = json.loads(
            soup.find_all('script', {'id': '__NEXT_DATA__'})[0].text
        )['props']['pageProps']['dehydratedState']['queries'][0]['state']['data']['data']
        fields = ['regStatus', 'emailList', 'companyShowBizTypeName',
                  'regCapitalLabel', 'companyProfilePlainText', 'approvedTime',
                  'industry2017', 'businessScope', 'taxNumber',
                  'regCapitalCurrency', 'regCapitalAmount', 'taxQualification',
                  'name', 'baseInfo', 'regCapital', 'staffNumRange',
                  'industry', 'legalPersonName', 'regNumber', 'creditCode',
                  'fromTime', 'socialStaffNum', 'companyOrgType', 'taxAddress',
                  'actualCapital', 'estiblishTime', 'taxBankAccount',
                  'regLocation']
        for key in fields:
            try:
                if 'Time' in key:
                    # epoch milliseconds -> 'YYYY-MM-DD' in local time
                    info[key] = time.strftime(
                        "%Y-%m-%d", time.localtime(int(text[key] / 1000)))
                else:
                    info[key] = text[key]
            except Exception:  # missing key / bad value -> empty string
                info[key] = ''
        return info

    def get_risk_info(self, soup, info):
        """Add risk counters: first 4 chars of each item = label, rest = value."""
        for item in soup.find_all("div", {"class": 'Risk_risk-item__G6j6A'}):
            item_text = item.text
            if item_text[:4]:
                info[item_text[:4]] = self.get_else(item_text[4:])
        return info

    def get_manage_info(self, soup, info):
        """Add tag-nav counters: label = text before first digit, value = rest."""
        for item in soup.find_all('a', {'class': 'index_tag-nav-item__JZafL'}):
            item_text = item.text
            if item_text != '':
                idx = self.split_text_num(item_text)
                info[item_text[:idx]] = self.get_else(item_text[idx:])
        return info

    def get_sifa_info(self, soup, info):
        """Add judicial counters (same page structure as :meth:`get_manage_info`).

        Fix: the original duplicated get_manage_info's body verbatim; delegate
        instead so the parsing rule lives in one place.
        """
        return self.get_manage_info(soup, info)

    def run(self):
        """Scrape every section for this company and return the merged dict."""
        # Registration ("工商") info
        business_info = self.get_soup('https://www.tianyancha.com/company/{}'.format(self.company_id))
        self.info = self.get_business_info(business_info, self.info)
        # Risk info
        risk_soup = self.get_soup('https://www.tianyancha.com/company/{}/jingxian'.format(self.company_id))
        self.info = self.get_risk_info(risk_soup, self.info)
        # Operating info
        run_soup = self.get_soup('https://www.tianyancha.com/company/{}/jingzhuang'.format(self.company_id))
        self.info = self.get_manage_info(run_soup, self.info)
        # Tax credit rating (JSON API; str(soup) recovers the raw body)
        tax_soup = self.get_soup(
            'https://capi.tianyancha.com/cloud-business-state/v3/ar/taxcred?gid={}&pageSize=10&pageNum=1'.format(
                self.company_id))
        tax_info = json.loads(str(tax_soup))
        try:
            self.info['税务评级'] = tax_info['data']['items'][0]['grade']
        except Exception:
            self.info['税务评级'] = ''
        # Qualification certificates still valid past 2023-05-01
        cet_soup = self.get_soup(
            'https://capi.tianyancha.com/cloud-business-state/certificate/list?graphId={}&pageSize=10&pageNum=1&type='.format(
                self.company_id))
        cet_info = json.loads(str(cet_soup))
        try:
            self.info['资格证书'] = [
                i['certificateName'] + ":" + i['certNo']
                for i in cet_info['data']['resultList']
                if i['endDate'] > "2023-05-01"]
        except Exception:
            self.info['资格证书'] = ''
        # Construction qualifications
        qual_soup = self.get_soup(
            'https://capi.tianyancha.com/cloud-company-background/construct/getQualificationList.json?gid={}&pageNum=1&pageSize=10&type='.format(
                self.company_id))
        try:
            self.info['建筑资质'] = [
                i['certificateNum'] + '-' + i['qualificationName']
                for i in json.loads(str(qual_soup))['data']['result']]
        except Exception:
            self.info['建筑资质'] = ''
        # Judicial info
        sifa_soup = self.get_soup('https://www.tianyancha.com/company/{}/sifa'.format(self.company_id))
        self.info = self.get_sifa_info(sifa_soup, self.info)
        return self.info
if __name__ == '__main__':
    # Company names come from the '签约对方' column of the customer list.
    df = pd.read_excel('客户清单pd.xlsx', index_col=0)
    # Seed info.json — NOTE(review): this overwrites any previous results.
    with open('info.json', 'w') as f:
        json.dump({'西安大地工程检测有限公司': "1"}, f)

    def _save_result(name, result):
        # Read-modify-write per company so a crash loses at most one entry.
        with open('info.json', 'r') as f:
            data = json.load(f)
        data[name] = result
        with open('info.json', 'w') as f:
            json.dump(data, f)

    def _scrape_one(label, name):
        # Scrape one company; print progress and return its info dict,
        # or None on any failure.
        print('*' * 20, label, '-', name, '*' * 20)
        try:
            result = SpiderSupplier(name).run()
        except Exception:  # narrowed from bare except
            print(label, '-', name, '网址解析错误')
            return None
        print(label, '-', name, '网址解析成功')
        return result

    companies = df['签约对方']
    fail_list = []
    # First pass over every company; remember the failures.
    for idx in companies.index:
        name = companies[idx]
        result = _scrape_one(idx, name)
        if result is None:
            fail_list.append(name)
        _save_result(name, result)
    # Second pass: retry the companies that failed the first time.
    # (fix: the original duplicated the whole scrape/save body here)
    for idx, name in enumerate(fail_list):
        _save_result(name, _scrape_one(idx, name))
# (blog page footer removed — "好文链接" / "发表评论" were article chrome,
#  commented out so the file parses as Python)