GoodsKG
GoodsKG, 基於京東網站的1300種商品上下級概念,約10萬商品品牌,約65萬品牌銷售關系,商品描述維度等知識庫,基於該知識庫可以支持商品屬性庫構建,商品銷售問答,品牌物品生產等知識查詢服務,也可用於情感分析等下游應用.
項目介紹
概念層級知識是整個常識知識體系中的重要組成部分。概念層級目前包括百科性概念層級和專業性概念層級兩類,百度百科概念體系以及互動百科分類體系是前者的代表。就後者而言,目前出現了許多垂直行業概念層級,如醫療領域概念體系、電商領域概念體系,其中以淘寶、京東等為代表的電商網站以商品為中心,構建起了一種商品概念目錄層級以及商品與品牌、品牌與品牌之間的關聯關系。
本項目認為,電商網站中的商品分類目錄能夠供我們構建起一個商品概念體系;基於商品首頁,我們可以得到商品與商品品牌之間的關系、商品的屬性以及屬性的取值信息。基於這類信息,又可以進一步得到商品的畫像以及商品品牌的畫像。基於該畫像,可以對自然語言處理的幾個下游應用帶來幫助,如商品品牌識別,商品對象及屬性級別情感分析,商品評價短語庫構建,商品品牌競爭關系梳理等。
因此,本項目以京東電商為實驗數據來源,採集京東商品目錄樹,並獲取其對應的底層商品概念信息,組織形成商品知識圖譜。目前,該圖譜包括概念的上下位is-a關系以及商品品牌與商品之間的銷售sale關系共兩類關系,涉及商品概念數目1300+,商品品牌數目約10萬+,屬性數目幾千種,關系數目65萬規模。該項目可以進一步增強商品領域概念體系的應用,為自然語言處理的幾個下游應用(如商品品牌識別,商品對象及屬性級別情感分析,商品評價短語庫構建,商品品牌競爭關系梳理等)提供基礎性的概念服務。
數據介紹
1, 基本數據內容
2, is-a概念上下位關系
MATCH p=()-[r:is_a]->() RETURN p LIMIT 25
3, sale銷售關系
MATCH p=()-[r:sales]->() RETURN p LIMIT 25
4, 混合關聯關系
MATCH (n:Product) where n.name='手機' RETURN n LIMIT 25
GitHub源碼
https://github.com/chenjj9527/ProductKnowledgeGraph-master
#!/usr/bin/env python3
# coding: utf-8
# File: build_kg.py
# Author: cjj
# Date: 19-12-23

import urllib.request
from urllib.parse import quote_plus
from lxml import etree
import gzip
import http.client
import chardet
import json
import pymongo


class GoodSchema:
    """Crawl the JD.com category tree and store each category's attributes.

    For every third-level category linked from the JD "all categories" page,
    the crawler fetches the category listing page, extracts its filter
    attributes and brand list, and inserts one document per category into
    the ``goodskg.data`` MongoDB collection.
    """

    # Encoding tried when chardet has no guess or its guess fails to decode.
    FALLBACK_ENCODING = 'gbk'

    # First two bytes of every gzip stream.
    GZIP_MAGIC = b'\x1f\x8b'

    def __init__(self):
        # MongoClient() with no arguments connects to the default local server.
        self.conn = pymongo.MongoClient()

    @staticmethod
    def _fetch(url, headers):
        """Return the raw response body of *url*, retrying once on failure.

        The single retry mirrors the original behaviour, where a failed
        download was attempted a second time before giving up.
        """
        req = urllib.request.Request(url, headers=headers)
        try:
            with urllib.request.urlopen(req) as resp:
                return resp.read()
        except (OSError, http.client.HTTPException):
            with urllib.request.urlopen(req) as resp:
                return resp.read()

    @classmethod
    def _decode(cls, data):
        """Decode *data* with the chardet-detected encoding, else GBK.

        The payload is decoded from memory; a wrong encoding guess no longer
        triggers a second download of the same page.
        """
        guess = chardet.detect(data)['encoding']
        if guess:
            try:
                return data.decode(guess)
            except (UnicodeDecodeError, LookupError):
                # Wrong or unknown codec name: fall through to the fallback.
                pass
        return data.decode(cls.FALLBACK_ENCODING)

    def get_html(self, url):
        """Fetch a search/listing page and return it as text."""
        headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/600.5.17 (KHTML, like Gecko) Version/8.0.5 Safari/600.5.17"}
        return self._decode(self._fetch(url, headers))

    def get_detail_html(self, url):
        """Fetch a category detail page and return it as text.

        Only ``gzip`` is advertised in ``accept-encoding`` because gzip is
        the only compression this method can undo; the body is decompressed
        only when it actually starts with the gzip magic bytes, so an
        uncompressed response is handled as well.
        """
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "accept-encoding": "gzip",
            "accept-language": "en-US,en;q=0.9",
            "cache-control": "max-age=0",
            "referer": "https://www.jd.com/allSort.aspx",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/66.0.3359.181 Chrome/66.0.3359.181 Safari/537.36",
        }
        data = self._fetch(url, headers)
        if data[:2] == self.GZIP_MAGIC:
            data = gzip.decompress(data)
        return self._decode(data)

    def home_list(self):
        """Walk the category tree on the home page and store every category.

        Each stored document has the keys ``fisrt_class``, ``second_class``,
        ``third_class`` and ``attrs``.
        """
        home_url = 'https://www.jd.com/allSort.aspx'
        selector = etree.HTML(self.get_html(home_url))
        divs = selector.xpath('//div[@class= "category-item m"]')
        collection = self.conn['goodskg']['data']
        for indx, div in enumerate(divs):
            first_names = div.xpath('./div[@class="mt"]/h2/span/text()')
            if not first_names:
                # A block without a heading would otherwise abort the crawl.
                continue
            first_name = first_names[0]
            for dl in div.xpath('./div[@class="mc"]/div[@class="items"]/dl'):
                second_names = dl.xpath('./dt/a/text()')
                if not second_names:
                    continue
                second_name = second_names[0]
                # Read name and href from the same <a> element so the two
                # can never drift out of step.
                for link in dl.xpath('./dd/a'):
                    href = link.get('href')
                    texts = link.xpath('./text()')
                    if not href or not texts:
                        continue
                    third_name = texts[0]
                    category_url = 'https:' + href
                    try:
                        attr_dict = self.parser_goods(category_url)
                        attr_dict.update(self.collect_brands(category_url))
                        data = {
                            # 'fisrt_class' (sic) is the key the graph
                            # builder reads, so the spelling must stay.
                            'fisrt_class': first_name,
                            'second_class': second_name,
                            'third_class': third_name,
                            'attrs': attr_dict,
                        }
                        # Collection.insert() was removed in PyMongo 4.
                        collection.insert_one(data)
                        print(indx, len(divs), first_name, second_name, third_name)
                    except Exception as e:
                        # Best-effort crawl: one broken category must not
                        # stop the remaining ones.
                        print(category_url, e)

    @staticmethod
    def _parse_other_exts(html):
        """Return the ``name -> value_name`` pairs of the page's ``other_exts`` array.

        Returns an empty dict when the page has no ``other_exts =[...]``
        array or when the array is not valid JSON.
        """
        marker = 'other_exts =['
        script = ''.join(line for line in html.split('\n') if 'other_exts' in line)
        if marker not in script:
            return {}
        body = script.split(marker)[-1].split('];')[0]
        if not body.strip():
            return {}
        try:
            entries = json.loads('[' + body + ']')
        except ValueError as err:
            print('unparseable other_exts:', err)
            return {}
        return {
            entry['name']: entry['value_name']
            for entry in entries
            if isinstance(entry, dict) and 'name' in entry and 'value_name' in entry
        }

    def parser_goods(self, url):
        """Return the attribute dict of the category page at *url*.

        Attributes come from the inline ``other_exts`` array and from the
        ``sl-wrap`` filter rows; multiple values of one filter row are
        joined with ``;``.
        """
        html = self.get_detail_html(url)
        attr_dict = self._parse_other_exts(html)
        selector = etree.HTML(html)
        for div in selector.xpath('//div[@class="sl-wrap"]'):
            names = div.xpath('./div[@class="sl-key"]/span/text()')
            if not names:
                continue
            attr_name = names[0].replace(':', '')
            values = div.xpath('./div[@class="sl-value"]/div/ul/li/a/text()')
            attr_dict[attr_name] = ';'.join(v.replace(' ', '') for v in values)
        return attr_dict

    def collect_brands(self, url):
        """Return ``{'品牌': 'a;b;c'}`` for the category at *url*.

        Returns an empty dict when the brand endpoint answers with an HTML
        page or lists no brands, so that an empty result never overwrites a
        brand list already parsed from the category page.
        """
        brand_url = url + '&sort=sort_rank_asc&trans=1&md=1&my=list_brand'
        html = self.get_html(brand_url)
        if 'html' in html:
            return {}
        data = json.loads(html)
        brands = data.get('brands') if isinstance(data, dict) else None
        if not brands:
            return {}
        return {'品牌': ';'.join(item['name'] for item in brands)}


if __name__ == '__main__':
    handler = GoodSchema()
    handler.home_list()
#!/usr/bin/env python3
# coding: utf-8
# File: build_kg.py
# Author: cjj
# Date: 19-12-23

import json
import os
from py2neo import Graph, Node, Relationship


class GoodsKg:
    """Load crawled category data into a Neo4j product knowledge graph.

    ``Product`` nodes are linked by ``is_a`` edges (child category to parent
    category) and ``Brand`` nodes are linked to ``Product`` nodes by
    ``sales`` edges.
    """

    def __init__(self):
        # os.path.dirname works with any path separator; splitting on '/'
        # yields an empty directory on Windows.
        cur = os.path.dirname(os.path.abspath(__file__))
        self.data_path = os.path.join(cur, 'data/goods_info.json')
        # NOTE(review): these keyword names follow the py2neo version the
        # script was written for; newer releases expect a URI plus auth=.
        self.g = Graph(
            host="127.0.0.1",  # IP address of the server hosting neo4j
            http_port=7474,  # port the neo4j server listens on
            user="neo4j",  # database user name; "neo4j" unless changed
            password="111111")

    def read_data(self):
        """Parse the JSON-lines data file into node names and relation strings.

        Returns:
            ``(concept_brand, concept_goods, rels_goods, rels_brand)``.
            The first two are sets of node names. The last two are lists of
            ``'start@rel_type@rel_name@end'`` strings, which may contain
            duplicates.
        """
        rels_goods = []
        rels_brand = []
        concept_goods = set()
        concept_brand = set()
        with open(self.data_path, encoding='UTF-8') as f:
            for count, line in enumerate(f, 1):
                print(count)
                line = line.strip()
                if not line:
                    # A blank line would make json.loads raise.
                    continue
                data = json.loads(line)
                first_class = data['fisrt_class'].replace("'", '')
                second_class = data['second_class'].replace("'", '')
                third_class = data['third_class'].replace("'", '')
                attr = data['attrs']
                concept_goods.update((first_class, second_class, third_class))
                rels_goods.append('@'.join([second_class, 'is_a', '屬於', first_class]))
                rels_goods.append('@'.join([third_class, 'is_a', '屬於', second_class]))

                if attr and '品牌' in attr:
                    for brand in attr['品牌'].split(';'):
                        brand = brand.replace("'", '')
                        if not brand:
                            # ''.split(';') yields [''], which would create
                            # a nameless Brand node.
                            continue
                        concept_brand.add(brand)
                        rels_brand.append('@'.join([brand, 'sales', '銷售', third_class]))

        return concept_brand, concept_goods, rels_goods, rels_brand

    def create_graph(self):
        """Create all nodes, then all edges, from the data file."""
        concept_brand, concept_goods, rels_goods, rels_brand = self.read_data()
        print('creating nodes....')
        self.create_node('Product', concept_goods)
        self.create_node('Brand', concept_brand)
        print('creating edges....')
        self.create_edges(rels_goods, 'Product', 'Product')
        self.create_edges(rels_brand, 'Brand', 'Product')

    def create_node(self, label, nodes):
        """Create one ``label`` node per name in *nodes*, 1000 per query.

        Every name is written, including the final partial batch. Names are
        passed as a query parameter, so quotes and backslashes in a name
        cannot break the statement.
        """
        names = list(nodes)
        total = len(names)
        bulk_size = 1000
        print(total // bulk_size)
        # Labels cannot be query parameters, so only the label is interpolated.
        query = 'UNWIND $names AS name CREATE (:%s {name: name})' % label
        for start in range(0, total, bulk_size):
            self.g.run(query, {'names': names[start:start + bulk_size]})
            print(min(start + bulk_size, total), '/', total, 'finished')

    def create_edges(self, rels, start_type, end_type):
        """Create one edge per distinct ``'start@rel_type@rel_name@end'`` string.

        Args:
            rels: relation strings as produced by ``read_data``.
            start_type: label of the start node.
            end_type: label of the end node.

        A string that does not split into exactly four fields (for example
        because a name contains ``@``) is reported and skipped. A failing
        query is reported and the remaining edges are still processed.
        """
        template = ('MATCH (m:%s {name: $start_name}), (n:%s {name: $end_name}) '
                    'CREATE (m)-[:%s {name: $rel_name}]->(n)')
        for count, rel in enumerate(set(rels), 1):
            parts = rel.split('@')
            if len(parts) != 4:
                print('skipping malformed relation:', rel)
            else:
                start_name, rel_type, rel_name, end_name = parts
                # Labels and relationship types cannot be parameters.
                query = template % (start_type, end_type, rel_type)
                params = {
                    'start_name': start_name,
                    'end_name': end_name,
                    'rel_name': rel_name,
                }
                try:
                    self.g.run(query, params)
                except Exception as e:
                    # Best-effort load: report and continue.
                    print(e)
            if count % 10 == 0:
                print(count)


if __name__ == '__main__':
    handler = GoodsKg()
    handler.create_graph()