Web Scraping in Practice: Tianyancha (天眼查) Company Information

# -*- coding: utf-8 -*-
import json
import time
import urllib.parse
import urllib.request

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

class SpiderSupplier:

    # Initialization: build the Tianyancha search URL and resolve the company id.
    def __init__(self, company):
        url = 'https://www.tianyancha.com/search?key={}'.format(company)
        # quote() lives in urllib.parse, not urllib.request.
        self.url = urllib.parse.quote(url, safe=";/?:@&=+$,", encoding='utf-8')
        self.company_id = None  # stays None if all ten attempts fail
        for i in range(10):
            try:
                # The first search hit links to /company/<id>; take the trailing id.
                self.company_id = self.get_soup(self.url).find_all(
                    "a", class_="index_alink__zcia5 link-click")[0].get('href').split('/')[-1]
                print(company, " company_id fetched, attempt %d succeeded" % (i + 1))
                break
            except Exception:
                print(company, " company_id unavailable, attempt %d failed, retrying..." % (i + 1))
                time.sleep(60)
        self.info = {}
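
    # A quick check of the quoting step: with safe=";/?:@&=+$,", only the
    # non-ASCII company name is percent-encoded and the URL scaffolding survives:
    #   urllib.parse.quote('https://www.tianyancha.com/search?key=天眼查',
    #                      safe=";/?:@&=+$,", encoding='utf-8')
    #   -> 'https://www.tianyancha.com/search?key=%E5%A4%A9%E7%9C%BC%E6%9F%A5'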

    # Fetch a URL with a randomly chosen cookie / User-Agent and return its soup.
    def get_soup(self, url):
        while True:
            try:
                # Rotate cookies randomly to spread requests across sessions.
                cookie = np.random.choice(['COOKIES'])
                usr_agent = np.random.choice([
                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
                ])
                head = {
                    'User-Agent': usr_agent,
                    # http.client only accepts latin-1 encodable header values,
                    # hence the utf-8 -> latin-1 round trip for the cookie.
                    'cookie': cookie.encode('utf-8').decode('latin1'),
                }
                request = urllib.request.Request(url, headers=head)
                break
            except Exception:
                print(url, " connection failed, retrying...")
                time.sleep(5)
        response = urllib.request.urlopen(request)
        html = response.read().decode('utf-8')
        return BeautifulSoup(html, 'html.parser')
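
    # Note: 'COOKIES' above is a placeholder for a pool of real logged-in cookie
    # strings, e.g. (values hypothetical):
    #   COOKIE_POOL = ['TYCID=...; auth_token=...', 'TYCID=...; auth_token=...']
    #   cookie = np.random.choice(COOKIE_POOL)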

    # Normalize an extracted value, falling back to '' on failure.
    # (The argument is evaluated at the call site, so this guard only matters
    # for expressions that cannot raise there, such as string slices.)
    def get_else(self, rule):
        try:
            return rule
        except Exception:
            return ''

    # Split a label from its trailing count: return the index of the first digit.
    def split_text_num(self, s):
        for idx, ch in enumerate(s):
            if ch.isdigit():
                return idx
        return len(s)
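
    # Examples: split_text_num('法律诉讼12') -> 4, splitting '法律诉讼' from '12';
    # split_text_num('暂无数据') -> 4, i.e. no digit, so the whole string is the label.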

    # Extract the business-registration record from the __NEXT_DATA__ script tag.
    def get_business_info(self, soup, info=None):
        if info is None:  # avoid the mutable-default-argument trap
            info = {}
        text = json.loads(soup.find_all('script', {'id': '__NEXT_DATA__'})[0].text)[
            'props']['pageProps']['dehydratedState']['queries'][0]['state']['data']['data']
        for i in ['regStatus', 'emailList', 'companyShowBizTypeName', 'regCapitalLabel',
                  'companyProfilePlainText', 'approvedTime', 'industry2017', 'businessScope',
                  'taxNumber', 'regCapitalCurrency', 'regCapitalAmount', 'taxQualification',
                  'name', 'baseInfo', 'regCapital', 'staffNumRange', 'industry',
                  'legalPersonName', 'regNumber', 'creditCode', 'fromTime', 'socialStaffNum',
                  'companyOrgType', 'taxAddress', 'actualCapital', 'estiblishTime',
                  'taxBankAccount', 'regLocation']:
            try:
                if 'Time' in i:
                    # *Time fields arrive as epoch milliseconds.
                    info[i] = time.strftime("%Y-%m-%d", time.localtime(int(text[i] / 1000)))
                    continue
                info[i] = text[i]
            except Exception:
                info[i] = ''
        return info
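
    # The key path above assumes __NEXT_DATA__ is shaped roughly like this
    # (trimmed; field values illustrative):
    #   {"props": {"pageProps": {"dehydratedState": {"queries": [
    #       {"state": {"data": {"data": {"name": "...", "estiblishTime": 1262275200000}}}}
    #   ]}}}}
    # i.e. the registration record sits in the first dehydrated react-query result.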

    # Extract risk information.
    def get_risk_info(self, soup, info):
        for i in soup.find_all("div", {"class": 'Risk_risk-item__G6j6A'}):
            i_text = i.text
            if i_text[:4]:
                # Each item reads "<4-char label><count>".
                info[i_text[:4]] = self.get_else(i_text[4:])
        return info

    # Extract operating information from the tab bar.
    def get_manage_info(self, soup, info):
        for i in soup.find_all('a', {'class': 'index_tag-nav-item__JZafL'}):
            i_text = i.text
            if i_text != '':
                # Tabs read "<label><count>"; split at the first digit.
                idx = self.split_text_num(i_text)
                info[i_text[:idx]] = self.get_else(i_text[idx:])
        return info

    # Extract judicial information from the tab bar.
    def get_sifa_info(self, soup, info):
        for i in soup.find_all("a", {"class": 'index_tag-nav-item__JZafL'}):
            i_text = i.text
            if i_text != '':
                idx = self.split_text_num(i_text)
                info[i_text[:idx]] = self.get_else(i_text[idx:])
        return info
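
    # get_sifa_info mirrors get_manage_info because the judicial page reuses the
    # same tab markup; the two could delegate to a single shared helper.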

    # Run the full scrape for this company.
    def run(self):
        # Add business-registration info.
        business_soup = self.get_soup('https://www.tianyancha.com/company/{}'.format(self.company_id))
        self.info = self.get_business_info(business_soup, self.info)
        # Add risk info.
        risk_soup = self.get_soup('https://www.tianyancha.com/company/{}/jingxian'.format(self.company_id))
        self.info = self.get_risk_info(risk_soup, self.info)
        # Add operating info.
        run_soup = self.get_soup('https://www.tianyancha.com/company/{}/jingzhuang'.format(self.company_id))
        self.info = self.get_manage_info(run_soup, self.info)
        # Add tax credit rating. This endpoint returns raw JSON, which
        # html.parser passes through untouched, so str(soup) recovers the body.
        tax_soup = self.get_soup(
            'https://capi.tianyancha.com/cloud-business-state/v3/ar/taxcred?gid={}&pageSize=10&pageNum=1'.format(
                self.company_id))
        tax_info = json.loads(str(tax_soup))
        try:
            self.info['税务评级'] = tax_info['data']['items'][0]['grade']
        except Exception:
            self.info['税务评级'] = ''
        # Add qualification certificates.
        cet_soup = self.get_soup(
            'https://capi.tianyancha.com/cloud-business-state/certificate/list?graphId={}&pageSize=10&pageNum=1&type='.format(
                self.company_id))
        cet_info = json.loads(str(cet_soup))
        try:
            # Keep only certificates still valid past 2023-05-01
            # (ISO date strings compare correctly as plain strings).
            self.info['资格证书'] = [i['certificateName'] + ":" + i['certNo']
                                 for i in cet_info['data']['resultList']
                                 if i['endDate'] > "2023-05-01"]
        except Exception:
            self.info['资格证书'] = ''
        # Add construction qualifications.
        qual_soup = self.get_soup(
            'https://capi.tianyancha.com/cloud-company-background/construct/getQualificationList.json?gid={}&pageNum=1&pageSize=10&type='.format(
                self.company_id))
        try:
            self.info['建筑资质'] = [i['certificateNum'] + '-' + i['qualificationName']
                                 for i in json.loads(str(qual_soup))['data']['result']]
        except Exception:
            self.info['建筑资质'] = ''
        # Add judicial info.
        sifa_soup = self.get_soup('https://www.tianyancha.com/company/{}/sifa'.format(self.company_id))
        self.info = self.get_sifa_info(sifa_soup, self.info)
        return self.info
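
# Minimal single-company usage (assumes the cookie pool holds valid logged-in
# cookies; kept as a comment so importing this module stays side-effect free):
#   spider = SpiderSupplier('西安大地工程检测有限公司')
#   info = spider.run()
#   print(info.get('legalPersonName'), info.get('税务评级'))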

if __name__ == '__main__':
    df = pd.read_excel('客户清单pd.xlsx', index_col=0)
    # Seed the result file with one known entry.
    with open('info.json', 'w', encoding='utf-8') as f:
        json.dump({'西安大地工程检测有限公司': "1"}, f)
    fail_list = []
    for idx in df['签约对方'].index:
        try:
            print('*' * 20, idx, '-', df['签约对方'][idx], '*' * 20)
            s = SpiderSupplier(df['签约对方'][idx])
            a = s.run()
            print(idx, '-', df['签约对方'][idx], 'parsed successfully')
        except Exception:
            print(idx, '-', df['签约对方'][idx], 'parse failed')
            fail_list.append(df['签约对方'][idx])
            a = None
        # Persist after every company so a crash loses at most one record.
        with open('info.json', 'r', encoding='utf-8') as f:
            data = json.load(f)
        data[df['签约对方'][idx]] = a
        with open('info.json', 'w', encoding='utf-8') as f:
            json.dump(data, f)

    # Retry the companies that failed on the first pass.
    for idx in range(len(fail_list)):
        try:
            print('*' * 20, idx, '-', fail_list[idx], '*' * 20)
            s = SpiderSupplier(fail_list[idx])
            a = s.run()
            print(idx, '-', fail_list[idx], 'parsed successfully')
        except Exception:
            print(idx, '-', fail_list[idx], 'parse failed')
            a = None
        with open('info.json', 'r', encoding='utf-8') as f:
            data = json.load(f)
        data[fail_list[idx]] = a
        with open('info.json', 'w', encoding='utf-8') as f:
            json.dump(data, f)
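
Once the crawl finishes, info.json maps each company name to its scraped info dict, or to null where both passes failed. A minimal sketch for pulling the results back into pandas for inspection; the output filename is hypothetical:

    import json
    import pandas as pd

    with open('info.json', 'r', encoding='utf-8') as f:
        results = json.load(f)
    # Keep only successful records; failed companies were stored as null.
    rows = {k: v for k, v in results.items() if isinstance(v, dict)}
    pd.DataFrame.from_dict(rows, orient='index').to_excel('企业信息汇总.xlsx')  # hypothetical filename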
