Github敏感信息泄露
1.背景
公司已經出現好幾次敏感信息泄露,處理方案都是被動應付式的,出現泄露就去github排查下,效果並不大。所以考慮自己搭建或使用已有開源的項目,參考鏈接如下:
https://www.freebuf.com/articles/web/173479.html 自己動手打造Github代碼泄露監控工具
https://www.freebuf.com/sectool/188102.html 自己動手打造Github代碼泄露監控工具之改進篇
兩款成熟的產品:
https://blog.csdn.net/u011728305/article/details/79970586 hawkeye
2.手把手
2.1 環境
Python 3.8、Windows 7、PyCharm
2.2 代碼
# -*- coding: utf-8 -*-
import configparser
import csv
import smtplib
from email import encoders
from email.header import Header
from email.mime.base import MIMEBase
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.utils import parseaddr,formataddr
from itertools import zip_longest
from time import sleep

import requests
from lxml import html
from tqdm import tqdm
def login_github(gUser, gPass):
    """Log in to GitHub through the web login form.

    Fetches ``https://github.com/login``, reads the hidden
    ``authenticity_token`` form field from it and posts that token together
    with the credentials to ``https://github.com/session``.

    Args:
        gUser: Value sent as the ``login`` form field.
        gPass: Value sent as the ``password`` form field.

    Returns:
        The ``requests`` session that performed the login when the POST is
        answered with HTTP 200; ``None`` when the token cannot be found, a
        request fails, or another status code is returned.
    """
    login_url = 'https://github.com/login'
    session_url = 'https://github.com/session'
    try:
        s = requests.session()
        resp = s.get(login_url).text
        dom_tree = html.etree.HTML(resp)
        tokens = dom_tree.xpath('//input[@name="authenticity_token"]/@value')
        if not tokens:
            # Without the token the POST cannot succeed, so stop here.
            print('異常')
            return None
        user_data = {
            'commit': 'Sign in',
            'utf8': '✓',
            # xpath() returns a list; the form expects the single token value.
            'authenticity_token': tokens[0],
            'login': gUser,
            'password': gPass,
        }
        # The form data is deliberately not printed: it contains the password.
        dl = s.post(session_url, data=user_data)
    except Exception as err:
        # Best-effort boundary: report the problem and signal failure with
        # None, but let KeyboardInterrupt / SystemExit propagate.
        print('異常', err)
        return None
    # NOTE(review): requests follows redirects, so a 200 here may not prove
    # that the credentials were accepted - confirm against GitHub's behaviour.
    if dl.status_code == 200:
        return s
    return None
def _scan_raw_file(blob_path, payloads, timeout=30):
    """Download one search hit as raw text and report each payload found in it."""
    # Drop only the first '/blob/' path segment; removing every '/blob'
    # substring would corrupt repository or file names that contain it.
    raw_url = 'https://raw.githubusercontent.com' + blob_path.replace('/blob/', '/', 1)
    try:
        resp = requests.get(raw_url, timeout=timeout)
    except requests.RequestException as err:
        # One unreachable file must not abort the whole scan.
        print(err)
        return []
    if resp.status_code != 200:
        return []
    code = resp.text
    return [
        '命中的Payload為:' + payload + '\r\n' + 'https://github.com' + blob_path
        + '\r\n\r\n\r\n' + '代碼如下: \r\n' + code + '\r\n\r\n'
        for payload in payloads
        if payload in code
    ]


def hunter(gUser, gPass, Keyword, payloads):
    """Search GitHub code for ``Keyword`` and collect files containing payloads.

    Logs in with ``login_github``, requests the code-search result page(s),
    writes one row per result (URL, user name, upload time, file name) to
    ``leak.csv`` and downloads every result file to look for the payload
    strings.

    Side effects: rebinds the module-level names ``sensitive_list`` and
    ``tUrls`` and (re)creates ``leak.csv`` in the working directory once the
    login has succeeded.

    Args:
        gUser: GitHub login passed to ``login_github``.
        gPass: GitHub password passed to ``login_github``.
        Keyword: Search term sent as the ``q`` query parameter.
        payloads: Iterable of strings; a file is reported once for every
            payload that occurs in its text.

    Returns:
        A list with one report string per (file, payload) hit. The list is
        empty when the login fails or nothing matches, and holds whatever
        was collected so far if an unexpected error interrupts the scan.
    """
    global sensitive_list
    global tUrls
    sensitive_list = []
    tUrls = []
    timeout = 30  # seconds; keeps a stalled connection from hanging the scan
    try:
        s = login_github(gUser, gPass)
        if s is None:
            print('登陸失敗,無法檢索泄露信息')
            return sensitive_list
        print('登陸成功,正在檢索泄露信息')
        # Open the report once so that later pages do not overwrite it.
        with open('leak.csv', 'w', encoding='utf-8', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['URL', 'Username', 'Upload Time', 'Filename'])
            for page in tqdm(range(1, 2)):
                # Passing params lets requests URL-encode the keyword.
                resp = s.get(
                    'https://github.com/search',
                    params={'p': page, 'q': Keyword, 'type': 'Code'},
                    timeout=timeout,
                )
                print(resp.url)
                sleep(1)
                dom_tree_code = html.etree.HTML(resp.text)
                hrefs = dom_tree_code.xpath('//div[@class="f4 text-normal"]/a/@href')
                users = dom_tree_code.xpath('//a[@class="link-gray"]/text()')
                upload_times = dom_tree_code.xpath('//relative-time/text()')
                filenames = dom_tree_code.xpath('//div[@class="f4 text-normal"]/a/text()')

                page_urls = ['https://github.com' + href for href in hrefs]
                tUrls.extend(page_urls)
                # The four xpath lists may differ in length; pad with ''
                # instead of failing on a missing index.
                for row in zip_longest(page_urls, users, upload_times,
                                       filenames, fillvalue=''):
                    writer.writerow(row)

                for blob_path in hrefs:
                    sensitive_list.extend(
                        _scan_raw_file(blob_path, payloads, timeout))
    except Exception as e:
        # Best-effort boundary: report the error and keep what was found.
        print(e)
    return sensitive_list
def send_warning(host, username, password, sender, receivers, content, port=25):
    """E-mail a leak warning with ``leak.csv`` attached.

    Builds a multipart message whose text part carries ``content`` and whose
    attachment is the file ``leak.csv`` from the working directory, then
    sends it through the SMTP server at ``host``.

    Args:
        host: SMTP server host name.
        username: Account name used for ``SMTP.login``.
        password: Password used for ``SMTP.login``.
        sender: Envelope sender address, also shown in the ``From`` header.
        receivers: List of recipient addresses.
        content: Text inserted into the message body.
        port: SMTP port; defaults to 25.

    Raises:
        OSError: If ``leak.csv`` cannot be opened. SMTP and connection
            errors are printed instead of raised.
    """
    def _format_addr(s):
        name, addr = parseaddr(s)
        # formataddr takes ONE (name, address) pair; its second positional
        # argument is a charset, not the address.
        return formataddr((Header(name, 'utf-8').encode(), addr))

    msg = MIMEMultipart()
    msg['From'] = _format_addr('Github安全監控<%s>' % sender)
    # Addresses in a header must be separated, not fused into one string.
    msg['To'] = ', '.join(receivers)
    Subject = 'Github敏感信息泄露通知'
    msg['Subject'] = Header(Subject, 'utf-8').encode()
    msg.attach(MIMEText(
        'Dear all \r\n\r\n請注意,懷疑Github上已經上傳敏感信息!以下是可能存在敏感信息的倉庫!\r\n\r\n'
        + content + '\r\n\r\n',
        'plain',
        'utf-8',
    ))

    with open('leak.csv', 'rb') as f:
        # text/csv is the registered media type for CSV files.
        m = MIMEBase('text', 'csv', filename='leak.csv')
        m.add_header('Content-Disposition', 'attachment', filename='leak.csv')
        m.add_header('Content-ID', '<0>')
        m.add_header('X-Attachment-ID', '0')
        m.set_payload(f.read())
    encoders.encode_base64(m)
    msg.attach(m)

    try:
        # The context manager closes the connection on every path, including
        # a failed login; nothing is left to quit when connecting fails.
        with smtplib.SMTP(host, port) as server:
            server.login(username, password)
            server.sendmail(sender, receivers, msg.as_string())
            print('郵件發送成功!')
    except (smtplib.SMTPException, OSError) as err:
        print(err)
if __name__ == '__main__':
    # Script entry point: read the settings from info.ini, run the search and
    # send a warning e-mail when at least one payload was found.
    config = configparser.ConfigParser()
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it did read; fail clearly instead of hitting a KeyError.
    if not config.read('info.ini'):
        raise SystemExit('無法讀取配置文件 info.ini')

    g_User = config['Github']['user']
    g_Pass = config['Github']['password']
    host = config['EMAIL']['host']
    m_User = config['EMAIL']['user']
    m_Pass = config['EMAIL']['password']
    m_sender = config['SENDER']['sender']
    # Every value in these sections is used; the option names are ignored.
    receivers = list(config['RECEIVER'].values())
    keyword = config['KEYWORD']['keyword']
    payloads = list(config['PAYLOADS'].values())

    sensitive_list = hunter(g_User, g_Pass, keyword, payloads)
    if sensitive_list:
        print('\033[1;31;0m警告:找到敏感信息!\r\n\033[0m')
        print('開始發送告警郵件......')
        content = ''.join(sensitive_list)
        send_warning(host, m_User, m_Pass, m_sender, receivers, content)
    else:
        # NOTE(review): the original indentation was lost; the three prints
        # are assumed to belong to this branch. Nothing is actually sent
        # after the last message - confirm the intended behaviour.
        print('恭喜:未找到敏感信息!\r\n')
        print('所有檢查已完成,已生成報表!\r\n')
        print('開始發送報表......\r\n')
2.3 代碼分析
首先看模塊
from lxml import html
import requests
import configparser
import csv
from time import sleep
from tqdm import tqdm
from email.utils import parseaddr,formataddr
from email.mime.multipart import MIMEMultipart
from email.header import Header
from email.mime.base import MIMEBase
主要關注lxml模塊,其他模塊系統自帶或pip自動安裝即可。
安裝lxml模塊:從 https://www.lfd.uci.edu/~gohlke/pythonlibs/#lxml 下載與Python版本相應的whl文件。直接安裝會報平台不支持的錯誤,將文件名lxml-4.4.2-cp38-cp38-win32.whl改為lxml-4.4.2-cp38-cp38m-win32.whl(即在第二個cp38後增加一個m),再次用pip安裝即可成功。
login_github()函數,實現github登錄,注意:
dl = s.post(session_url,data=user_data)
if dl.status_code == 200:
發起POST請求後,如果不對響應狀態碼加以判斷,登錄失敗時後續請求極有可能以遊客(未登錄)的身份發出,從而出錯。
key = dom_tree.xpath('//input[@name="authenticity_token"]/@value')
通過XPath獲取token值
hunter()、send_warning()、ini文件配置
問題:1.沒有去除重複,搜索出來的結果可能重複出現。去除重複的方法:以用戶、文件名、泄露的代碼、代碼泄露時間為變量產生一個mid值,mid值相同的結果只保留一條。
2.找到泄露的代碼後保存的是整個文件的代碼,內容有點多,很多信息沒必要獲取。可以先找到泄露點,再獲取其所在的行,只保留該行及其上下幾行。
3.發送郵件報警的時候,第一次我們發送全部泄露點,但是第二次我們的關注點應該是新增的泄露點,所以和問題1配合,新增的mid值即為新增的泄露點。
3.改進
4.gsil
5.hawkeye