GoodsKG

　　GoodsKG, 基於京東網站的1300種商品上下級概念，約10萬商品品牌，約65萬品牌銷售關系，商品描述維度等知識庫，基於該知識庫可以支持商品屬性庫構建，商品銷售問答，品牌物品生產等知識查詢服務，也可用於情感分析等下游應用．

項目介紹

　　概念層級知識是整個常識知識體系中的重要組成部分。概念層級目前包括百科性概念層級和專業性概念層級兩類，百度百科概念體系以及互動百科分類體系是其中的一個代表。就后者而言，目前出現了許多垂直行業概念層級，如醫療領域概念體系，墊上領域概念體系，其中以淘寶、京東等為代表的電商網站在以商品為中心上，構建起了一種商品概念目錄層級以及商品與品牌，品牌與品牌之間的關聯關系。
　　本項目認為，電商網站中的商品分類目錄能夠供我們構建起一個商品概念體系，基於商品首頁，我們可以得到商品與商品品牌之間的關系，商品的屬性以及屬性的取值信息。基於這類信息，又可以進一步得到商品的畫像以及商品品牌的畫像。基於該畫像。可以對自然語言處理處理的幾個下游應用帶來幫助，如商品品牌識別，商品對象及屬性級別情感分析，商品評價短語庫構建，商品品牌競爭關系梳理等。
　　因此，本項目以京東電商為實驗數據來源，采集京東商品目錄樹，並獲取其對應的底層商品概念信息，組織形成商品知識圖譜。目前，該圖譜包括有概念的上下位is a關系以及商品品牌與商品之間的銷售sale關系共兩類關系，涉及商品概念數目1300+，商品品牌數目約10萬+，屬性數目幾千種，關系數目65萬規模。該項目可以進一步增強商品領域概念體系的應用，對自然語言處理處理的幾個下游應用帶來幫助，如商品品牌識別，商品對象及屬性級別情感分析，商品評價短語庫構建，商品品牌競爭關系梳理等提供基礎性的概念服務。

數據介紹

1, 基本數據內容

2, is-a概念上下位關系

MATCH p=()-[r:is_a]->() RETURN p LIMIT 25

3, sale銷售關系

MATCH p=()-[r:sales]->() RETURN p LIMIT 25

4, 混合關聯關系

MATCH (n:Product) where n.name=’手機’ RETURN n LIMIT 25

GitHub源碼

https://github.com/chenjj9527/ProductKnowledgeGraph-master

  1 #!/usr/bin/env python3
  2 # coding: utf-8
  3 # File: build_kg.py
  4 # Author: cjj
  5 # Date: 19-12-23
  6 
  7 import urllib.request
  8 from urllib.parse import quote_plus
  9 from lxml import etree
 10 import gzip
 11 import chardet
 12 import json
 13 import pymongo
 14 
 15 class GoodSchema:
 16     def __init__(self):
 17         self.conn = pymongo.MongoClient()
 18         return
 19 
 20     '''獲取搜索頁'''
 21     def get_html(self, url):
 22         headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/600.5.17 (KHTML, like Gecko) Version/8.0.5 Safari/600.5.17"}
 23         try:
 24             req = urllib.request.Request(url, headers=headers)
 25             data = urllib.request.urlopen(req).read()
 26             coding = chardet.detect(data)
 27             html = data.decode(coding['encoding'])
 28         except:
 29             req = urllib.request.Request(url, headers=headers)
 30             data = urllib.request.urlopen(req).read()
 31             html = data.decode('gbk')
 32 
 33 
 34         return html
 35 
 36     '''獲取詳情頁'''
 37     def get_detail_html(self, url):
 38         headers = {
 39             "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
 40             "accept-encoding": "gzip, deflate, br",
 41             "accept-language": "en-US,en;q=0.9",
 42             "cache-control": "max-age=0",
 43             "referer": "https://www.jd.com/allSort.aspx",
 44             "upgrade-insecure-requests": 1,
 45             "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/66.0.3359.181 Chrome/66.0.3359.181 Safari/537.36"
 46         }
 47         try:
 48             req = urllib.request.Request(url, headers=headers)
 49             data = urllib.request.urlopen(req).read()
 50             html = gzip.decompress(data)
 51             coding = chardet.detect(html)
 52             html = html.decode(coding['encoding'])
 53         except Exception as e:
 54             req = urllib.request.Request(url, headers=headers)
 55             data = urllib.request.urlopen(req).read()
 56             html = gzip.decompress(data)
 57             html = html.decode('gbk')
 58         return html
 59 
 60 
 61     '''根據主頁獲取數據'''
 62     def home_list(self):
 63         url = 'https://www.jd.com/allSort.aspx'
 64         html = self.get_html(url)
 65         selector = etree.HTML(html)
 66         divs = selector.xpath('//div[@class= "category-item m"]')
 67         for indx, div in enumerate(divs):
 68             first_name = div.xpath('./div[@class="mt"]/h2/span/text()')[0]
 69             second_classes = div.xpath('./div[@class="mc"]/div[@class="items"]/dl')
 70             for dl in second_classes:
 71                 second_name = dl.xpath('./dt/a/text()')[0]
 72                 third_classes = ['https:' + i for i in dl.xpath('./dd/a/@href')]
 73                 third_names = dl.xpath('./dd/a/text()')
 74                 for third_name, url in zip(third_names, third_classes):
 75                     try:
 76                         attr_dict = self.parser_goods(url)
 77                         attr_brand = self.collect_brands(url)
 78                         attr_dict.update(attr_brand)
 79                         data = {}
 80                         data['fisrt_class'] = first_name
 81                         data['second_class'] = second_name
 82                         data['third_class'] = third_name
 83                         data['attrs'] = attr_dict
 84                         self.conn['goodskg']['data'].insert(data)
 85                         print(indx, len(divs), first_name, second_name, third_name)
 86                     except Exception as e:
 87                         print(e)
 88         return
 89 
 90     '''解析商品數據'''
 91     def parser_goods(self, url):
 92         html = self.get_detail_html(url)
 93         selector = etree.HTML(html)
 94         title = selector.xpath('//title/text()')
 95         attr_dict = {}
 96         other_attrs = ''.join([i for i in html.split('\n') if 'other_exts' in i])
 97         other_attr = other_attrs.split('other_exts =[')[-1].split('];')[0]
 98         if other_attr and 'var other_exts ={};' not in other_attr:
 99             for attr in other_attr.split('},'):
100                 if '}' not in attr:
101                     attr = attr + '}'
102                 data = json.loads(attr)
103                 key = data['name']
104                 value = data['value_name']
105                 attr_dict[key] = value
106         attr_divs = selector.xpath('//div[@class="sl-wrap"]')
107         for div in attr_divs:
108             attr_name = div.xpath('./div[@class="sl-key"]/span/text()')[0].replace('：','')
109             attr_value = ';'.join([i.replace('  ','') for i in div.xpath('./div[@class="sl-value"]/div/ul/li/a/text()')])
110             attr_dict[attr_name] = attr_value
111 
112         return attr_dict
113 
114     '''解析品牌數據'''
115     def collect_brands(self, url):
116         attr_dict = {}
117         brand_url = url + '&sort=sort_rank_asc&trans=1&md=1&my=list_brand'
118         html = self.get_html(brand_url)
119         if 'html' in html:
120             return attr_dict
121         data = json.loads(html)
122         brands = []
123 
124         if 'brands' in data and data['brands'] is not None:
125             brands = [i['name'] for i in data['brands']]
126         attr_dict['品牌'] = ';'.join(brands)
127 
128         return attr_dict
129 
130 
131 
132 if __name__ == '__main__':
133     handler = GoodSchema()
134     handler.home_list()

  1 #!/usr/bin/env python3
  2 # coding: utf-8
  3 # File: build_kg.py
  4 # Author: cjj
  5 # Date: 19-12-23
  6 
  7 import json
  8 import os
  9 from py2neo import Graph, Node, Relationship
 10 
 11 
 12 class GoodsKg:
 13     def __init__(self):
 14         cur = '/'.join(os.path.abspath(__file__).split('/')[:-1])
 15         self.data_path = os.path.join(cur, 'data/goods_info.json')
 16         self.g = Graph(
 17             host="127.0.0.1",  # neo4j 搭載服務器的ip地址，ifconfig可獲取到
 18             http_port=7474,  # neo4j 服務器監聽的端口號
 19             user="neo4j",  # 數據庫user name，如果沒有更改過，應該是neo4j
 20             password="111111")
 21         return
 22 
 23     '''讀取數據'''
 24     def read_data(self):
 25         rels_goods = []
 26         rels_brand = []
 27         goods_attrdict = {}
 28         concept_goods = set()
 29         concept_brand = set()
 30         count = 0
 31         for line in open(self.data_path,encoding='UTF-8'):
 32             count += 1
 33             print(count)
 34             line = line.strip()
 35             data = json.loads(line)
 36             first_class = data['fisrt_class'].replace("'",'')
 37             second_class = data['second_class'].replace("'",'')
 38             third_class = data['third_class'].replace("'",'')
 39             attr = data['attrs']
 40             concept_goods.add(first_class)
 41             concept_goods.add(second_class)
 42             concept_goods.add(third_class)
 43             rels_goods.append('@'.join([second_class, 'is_a', '屬於', first_class]))
 44             rels_goods.append('@'.join([third_class, 'is_a', '屬於', second_class]))
 45 
 46             if attr and '品牌' in attr:
 47                 brands = attr['品牌'].split(';')
 48                 for brand in brands:
 49                     brand = brand.replace("'",'')
 50                     concept_brand.add(brand)
 51                     rels_brand.append('@'.join([brand, 'sales', '銷售', third_class]))
 52 
 53             goods_attrdict[third_class] = {name:value for name,value in attr.items() if name != '品牌'}
 54 
 55         return concept_brand, concept_goods, rels_goods, rels_brand
 56 
 57     '''構建圖譜'''
 58     def create_graph(self):
 59         concept_brand, concept_goods, rels_goods, rels_brand = self.read_data()
 60         print('creating nodes....')
 61         self.create_node('Product', concept_goods)
 62         self.create_node('Brand', concept_brand)
 63         print('creating edges....')
 64         self.create_edges(rels_goods, 'Product', 'Product')
 65         self.create_edges(rels_brand, 'Brand', 'Product')
 66         return
 67 
 68     '''批量建立節點'''
 69     def create_node(self, label, nodes):
 70         pairs = []
 71         bulk_size = 1000
 72         batch = 0
 73         bulk = 0
 74         batch_all = len(nodes)//bulk_size
 75         print(batch_all)
 76         for node_name in nodes:
 77             sql = """CREATE(:%s {name:'%s'})""" % (label, node_name)
 78             pairs.append(sql)
 79             bulk += 1
 80             if bulk % bulk_size == 0 or bulk == batch_all+1:
 81                 sqls = '\n'.join(pairs)
 82                 self.g.run(sqls)
 83                 batch += 1
 84                 print(batch*bulk_size,'/', len(nodes), 'finished')
 85                 pairs = []
 86         return
 87 
 88 
 89     '''構造圖譜關系邊'''
 90     def create_edges(self, rels, start_type, end_type):
 91         batch = 0
 92         count = 0
 93         for rel in set(rels):
 94             count += 1
 95             rel = rel.split('@')
 96             start_name = rel[0]
 97             end_name = rel[3]
 98             rel_type = rel[1]
 99             rel_name = rel[2]
100             sql = 'match (m:%s), (n:%s) where m.name = "%s" and n.name = "%s" create (m)-[:%s{name:"%s"}]->(n)' %(start_type, end_type, start_name, end_name,rel_type,rel_name)
101             try:
102                 self.g.run(sql)
103             except Exception as e:
104                 print(e)
105             if count%10 == 0:
106                 print(count)
107 
108         return
109 
110 
111 if __name__ =='__main__':
112     handler = GoodsKg()
113     handler.create_graph()

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 20200926 DataFunTalk：知識圖譜專場（1）京東，知識圖譜在電商實踐知識圖譜基本概念知識圖譜基本概念知識圖譜基礎概念知識圖譜 - 基礎概念梳理最全知識圖譜的概念篇通用概念知識圖譜介紹電商產品設計：商品的基本屬性與銷售屬性美團商品知識圖譜的構建及應用十七，教育知識圖譜的概念模型構建（EKGCM）