前言 学习github上的项目代码,熟悉知识图谱构建流程。

知识图谱数据源

一部分数据爬取自网站,另一部分数据直接使用开源的Tushare,这部分数据可以直接用来构建实体。

抽取网页数据代码

在原项目代码上修改了几处,加了一些注释。

import os

import csv

from lxml import etree

def extract(stockpage_dir, directors_csv):
    """Extract the executives of each company/stock from saved HTML pages.

    Args:
        stockpage_dir: (str) directory containing the downloaded stock pages
            (one ``<code>.html`` file per stock, GBK encoded).
        directors_csv: (str) full path of the CSV file to be written, with
            columns name, gender, age, code, jobs.
    """
    # Full path of every .html file under stockpage_dir (os.listdir returns
    # bare file names in arbitrary order, so join the directory back on).
    pages = (os.path.join(stockpage_dir, name)
             for name in os.listdir(stockpage_dir)
             if name.endswith('.html'))

    headers = ['name', 'gender', 'age', 'code', 'jobs']
    with open(directors_csv, 'w', encoding='utf-8', newline="") as file_directors:
        file_directors_csv = csv.DictWriter(file_directors, headers)
        file_directors_csv.writeheader()
        for page in pages:
            print(page)  # the full path of a stock page
            # BUG FIX: the original chain of split('/') / split('.') /
            # split('\\') only worked for one specific mixed-separator path
            # layout; basename + splitext extracts the stock code robustly
            # on any platform.
            code = os.path.splitext(os.path.basename(page))[0]
            executives = []
            with open(page, 'r', encoding='gbk') as file_page:
                # etree.HTML parses the raw HTML string into an _Element
                # tree so we can locate the data with xpath().
                html = etree.HTML(file_page.read())
            # Each director sits in its own "person_table" div.
            divs = html.xpath('//div[@id="ml_001"]//div[contains(@class, "person_table")]')
            for div in divs:
                item = {}
                # Commas are replaced so the values stay single CSV cells.
                item['name'] = div.xpath('.//thead/tr/td/h3/a/text()')[0].replace(',', '-')
                item['jobs'] = div.xpath('.//thead/tr[1]/td[2]/text()')[0].replace(',', '/')
                gender_age_education = div.xpath('.//thead/tr[2]/td[1]/text()')[0].split()
                try:
                    item['gender'] = gender_age_education[0]
                    if item['gender'] not in ('男', '女'):
                        item['gender'] = 'null'  # null for unknown
                except IndexError:
                    item['gender'] = 'null'
                try:
                    # Strip the trailing 岁 ("years old") before converting.
                    item['age'] = int(gender_age_education[1].strip('岁'))
                except (IndexError, ValueError):
                    item['age'] = -1  # -1 for unknown
                item['code'] = code
                executives.append(item)
            # write this page's rows to the csv file
            file_directors_csv.writerows(executives)

if __name__ == '__main__':
    # Parse every saved stock page and dump all executives into one CSV.
    extract('./data/stockpage', './data/executive_prep.csv')

上述代码利用xpath定位数据源网页文件元素,抽取出实体信息存储在csv文件中,与用requests、bs4包爬取网页信息方法不同。但如果想自己写爬虫代码的话,需要熟悉python语言,并且懂一点html相关的知识。抽取数据截图如下:

数据处理

从网站上爬取的数据虽然存储为csv文件中,但是需要处理,将每种实体单独存在一个csv文件中,这样在neo4j中可以直接生成实体比较方便,实体之间的关系可以抽取出来存在csv中,也可以利用py2neo.NodeMatcher查询实体,再利用Relationship生成关系。

在原项目基础上,为以写模式打开文件的open函数添加了newline参数,这样生成的csv文件中不会包含多余的空行。

import os

import csv

import hashlib

# 加密算法主要用来生成实体id

def get_md5(string):
    """Return the hex MD5 digest of *string* (UTF-8 encoded).

    Used throughout this module to derive stable entity ids for the
    Neo4j import files.
    """
    return hashlib.md5(string.encode("utf-8")).hexdigest()

# 生成Person实体对应的csv文件

def build_executive(executive_prep, executive_import):
    """Create an 'executive' file in csv format that can be imported into Neo4j.

    format -> person_id:ID,name,gender,age:int,:LABEL
    label -> Person

    Args:
        executive_prep: path of the prepared executives CSV (input).
        executive_import: path of the Neo4j import CSV to write (output).
    """
    print('Writing to {} file...'.format(executive_import.split('/')[-1]))
    with open(executive_prep, 'r', encoding='utf-8') as src, \
            open(executive_import, 'w', encoding='utf-8', newline="") as dst:
        reader = csv.reader(src, delimiter=',')
        writer = csv.writer(dst, delimiter=',')
        writer.writerow(['person_id:ID', 'name', 'gender', 'age:int', ':LABEL'])
        for index, row in enumerate(reader):
            # Skip the header row and rows missing name/gender/age.
            if index == 0 or len(row) < 3:
                continue
            # The id is md5("name,gender,age") so identical triples map to
            # the same Person node on import.
            person_id = get_md5('{},{},{}'.format(row[0], row[1], row[2]))
            writer.writerow([person_id, row[0], row[1], row[2], 'Person'])
    print('- done.')

# 生成Company实体对应的csv文件

def build_stock(stock_industry_prep, stock_concept_prep, stock_import):
    """Create a 'stock' file in csv format that can be imported into Neo4j.

    format -> stock_id:ID,name,code,:LABEL
    label -> Company or Company;ST (ST marks loss-making stocks)

    Args:
        stock_industry_prep: path of the prepared stock/industry CSV (input).
        stock_concept_prep: path of the prepared stock/concept CSV (input).
        stock_import: path of the Neo4j import CSV to write (output).
    """
    print('Writing to {} file...'.format(stock_import.split('/')[-1]))
    stock = set()  # entries are 'code,name' strings, deduplicated across both inputs

    # Collect stock code and company name from the industry file.
    with open(stock_industry_prep, 'r', encoding='utf-8') as file_prep:
        for i, row in enumerate(csv.reader(file_prep, delimiter=',')):
            if i == 0:  # skip the header row
                continue
            stock.add('{},{}'.format(row[0], row[1].replace(' ', '')))

    # Collect stock code and stock name from the concept file.
    with open(stock_concept_prep, 'r', encoding='utf-8') as file_prep:
        for i, row in enumerate(csv.reader(file_prep, delimiter=',')):
            if i == 0:
                continue
            stock.add('{},{}'.format(row[0], row[1].replace(' ', '')))

    with open(stock_import, 'w', encoding='utf-8', newline="") as file_import:
        file_import_csv = csv.writer(file_import, delimiter=',')
        file_import_csv.writerow(['stock_id:ID', 'name', 'code', ':LABEL'])
        for entry in stock:
            parts = entry.split(',')
            # '*ST' is checked before 'ST' so the longer marker wins.
            for marker in ('*ST', 'ST', 'S*ST', 'SST'):
                if parts[1].startswith(marker):
                    # BUG FIX: the original used str.replace(marker, ''),
                    # which also deleted the marker text occurring *inside*
                    # the name; slice off only the prefix.
                    file_import_csv.writerow(
                        [parts[0], parts[1][len(marker):], parts[0], 'Company;ST'])
                    break
            else:
                file_import_csv.writerow([parts[0], parts[1], parts[0], 'Company'])
    print('- done.')

# 生成Concept实体对应的csv文件

def build_concept(stock_concept_prep, concept_import):
    """Create a 'concept' file in csv format that can be imported into Neo4j.

    format -> concept_id:ID,name,:LABEL
    label -> Concept

    Args:
        stock_concept_prep: path of the prepared stock/concept CSV (input).
        concept_import: path of the Neo4j import CSV to write (output).
    """
    print('Writing to {} file...'.format(concept_import.split('/')[-1]))
    with open(stock_concept_prep, 'r', encoding='utf-8') as file_prep, \
            open(concept_import, 'w', encoding='utf-8', newline="") as file_import:
        reader = csv.reader(file_prep, delimiter=',')
        writer = csv.writer(file_import, delimiter=',')
        writer.writerow(['concept_id:ID', 'name', ':LABEL'])
        # Deduplicate the concept column (index 2), skipping the header row.
        concepts = {row[2] for i, row in enumerate(reader) if i > 0}
        for concept in concepts:
            writer.writerow([get_md5(concept), concept, 'Concept'])
    print('- done.')

# 生成Industry实体对应的csv文件

def build_industry(stock_industry_prep, industry_import):
    """Create an 'industry' file in csv format that can be imported into Neo4j.

    format -> industry_id:ID,name,:LABEL
    label -> Industry

    Args:
        stock_industry_prep: path of the prepared stock/industry CSV (input).
        industry_import: path of the Neo4j import CSV to write (output).
    """
    # CONSISTENCY FIX: every sibling builder prints 'Writing to ...';
    # this one said 'Write to ...'.
    print('Writing to {} file...'.format(industry_import.split('/')[-1]))
    with open(stock_industry_prep, 'r', encoding='utf-8') as file_prep, \
            open(industry_import, 'w', encoding='utf-8', newline="") as file_import:
        file_prep_csv = csv.reader(file_prep, delimiter=',')
        file_import_csv = csv.writer(file_import, delimiter=',')
        file_import_csv.writerow(['industry_id:ID', 'name', ':LABEL'])
        industries = set()
        for i, row in enumerate(file_prep_csv):
            if i == 0:  # skip the header row
                continue
            industries.add(row[2])  # industry name is the 3rd column
        for industry in industries:
            # Node id is md5(name) — the same scheme the relation builders use.
            file_import_csv.writerow([get_md5(industry), industry, 'Industry'])
    print('- done.')

# 生成 employ_of 关系对应的 csv 文件,title 是 employ_of 关系的属性

def build_executive_stock(executive_prep, relation_import):
    """Create an 'executive_stock' file in csv format that can be imported into Neo4j.

    format -> :START_ID,jobs,:END_ID,:TYPE
              (person)         (stock)
    type -> employ_of

    Args:
        executive_prep: path of the prepared executives CSV (input).
        relation_import: path of the Neo4j relation CSV to write (output).
    """
    with open(executive_prep, 'r', encoding='utf-8') as src, \
            open(relation_import, 'w', encoding='utf-8', newline="") as dst:
        reader = csv.reader(src, delimiter=',')
        writer = csv.writer(dst, delimiter=',')
        writer.writerow([':START_ID', 'jobs', ':END_ID', ':TYPE'])
        for index, row in enumerate(reader):
            if index == 0:  # header row
                continue
            # START_ID is the same md5("name,gender,age") used for Person
            # nodes in build_executive, so the relation links up on import.
            person_id = get_md5('{},{},{}'.format(row[0], row[1], row[2]))
            writer.writerow([person_id, row[4], row[3], 'employ_of'])

# 生成industry_of关系对应的csv文件,这个关系没有属性

def build_stock_industry(stock_industry_prep, relation_import):
    """Create a 'stock_industry' file in csv format that can be imported into Neo4j.

    format -> :START_ID,:END_ID,:TYPE
              (stock)   (industry)
    type -> industry_of

    Args:
        stock_industry_prep: path of the prepared stock/industry CSV (input).
        relation_import: path of the Neo4j relation CSV to write (output).
    """
    with open(stock_industry_prep, 'r', encoding='utf-8') as src, \
            open(relation_import, 'w', encoding='utf-8', newline="") as dst:
        reader = csv.reader(src, delimiter=',')
        writer = csv.writer(dst, delimiter=',')
        writer.writerow([':START_ID', ':END_ID', ':TYPE'])
        for index, row in enumerate(reader):
            if index == 0:  # header row
                continue
            # Stock code starts the relation; the industry node id is
            # md5(industry name), matching build_industry.
            writer.writerow([row[0], get_md5(row[2]), 'industry_of'])

# 生成 concept_of 关系对应的csv文件,该关系同样没有属性

def build_stock_concept(stock_concept_prep, relation_import):
    """Create a 'stock_concept' file in csv format that can be imported into Neo4j.

    format -> :START_ID,:END_ID,:TYPE
              (stock)   (concept)
    type -> concept_of

    Args:
        stock_concept_prep: path of the prepared stock/concept CSV (input).
        relation_import: path of the Neo4j relation CSV to write (output).
    """
    with open(stock_concept_prep, 'r', encoding='utf-8') as src, \
            open(relation_import, 'w', encoding='utf-8', newline="") as dst:
        reader = csv.reader(src, delimiter=',')
        writer = csv.writer(dst, delimiter=',')
        writer.writerow([':START_ID', ':END_ID', ':TYPE'])
        for index, row in enumerate(reader):
            if index == 0:  # header row
                continue
            # Stock code starts the relation; the concept node id is
            # md5(concept name), matching build_concept.
            writer.writerow([row[0], get_md5(row[2]), 'concept_of'])

if __name__ == '__main__':
    # Neo4j import files are written under data1/import/.
    out_dir = 'data1/import'
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    # Entity files first, then relation files.
    build_executive('data1/executive_prep.csv', 'data1/import/executive.csv')
    build_stock('data1/stock_industry_prep.csv', 'data1/stock_concept_prep.csv',
                'data1/import/stock.csv')
    build_concept('data1/stock_concept_prep.csv', 'data1/import/concept.csv')
    build_industry('data1/stock_industry_prep.csv', 'data1/import/industry.csv')
    build_executive_stock('data1/executive_prep.csv', 'data1/import/executive_stock.csv')
    build_stock_industry('data1/stock_industry_prep.csv', 'data1/import/stock_industry.csv')
    build_stock_concept('data1/stock_concept_prep.csv', 'data1/import/stock_concept.csv')

运行结果如下:

构建知识图谱

原作者直接将抽取好的实体和关系csv文件导入neo4j生成知识图谱,注意csv文件的存储路径就行。

思考

构建人的实体时,重名问题具体怎么解决?

总结

感觉这个项目是原作者的一个作业,原作者将它分享到github上了。他构建知识图谱的重点在爬取网页数据和处理数据上,处理好数据以后直接导入neo4j生成知识图谱了。

查看原文