Python BeautifulSoup 获取页面多个子节点中的各个节点的内容


页面 HTML 格式如下:

<tr bgcolor="#7bb5de">
<td style="border-bottom: 1px solid #C9D8AD" width="118" align="center" bgcolor="#D9E6FF">
<p align="center">
lyl5577d92</p></td>
<td style="border-bottom: 1px solid #C9D8AD" width="96" align="center" bgcolor="#D9E6FF">
<p align="center">李永利</p></td>
<td align="center" style="border-bottom: 1px solid #C9D8AD" width="105" bgcolor="#D9E6FF">
<div align="center"><font color="#FF0000">lyl5577d</font></div>
</td>
<td align="center" style="border-bottom: 1px solid #C9D8AD" width="153" bgcolor="#D9E6FF">
<div align="center">469680008</div>
</td>
<td align="center" style="border-bottom: 1px solid #C9D8AD" width="218" bgcolor="#D9E6FF">
<div align="center">2016-05-21 15:24:27.0</div>
</td>
<td align="center" style="border-bottom: 1px solid #C9D8AD" width="171" bgcolor="#D9E6FF">
<div align="center">0</div>
</td>
<td align="center" style="border-bottom: 1px solid #C9D8AD" width="119" bgcolor="#D9E6FF">0</td>
<td align="center" style="border-bottom: 1px solid #C9D8AD" width="111" bgcolor="#D9E6FF">0</td>
<td align="center" style="border-bottom: 1px solid #C9D8AD" width="87" bgcolor="#D9E6FF">0</td>
<td align="center" style="border-bottom: 1px solid #C9D8AD" width="128" bgcolor="#D9E6FF">0</td>
</tr>

 

 1 import httplib  2 from BeautifulSoup import BeautifulSoup  3 
 4 
 5 def main():  6     f = open('result','a')  7 
 8     headers = {'Content-Type':'application/x-www-form-urlencoded',  9         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 10         'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 11         'Accept-Encoding': 'gzip, deflate', 12         'Referer': 'http://xxx.xxx.com/admin/userlist', 13         'Cookie': 'JSESSIONID=9F6F2D03D2C11400B3D6731E90D73117', 14         'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 6.1; rv:46.0) Gecko/20100101 Firefox/46.0', 15  } 16           
17     conn = httplib.HTTPConnection('*.*.*.*', timeout=50) 18 
19     for p in range(1,1287): 20         print p 21         conn.request(method='GET', 22                         url="/admin/userlist?toPage=%s&sessionID=" % str(p), 23                         headers=headers) 24         resp = conn.getresponse() 25         html_doc = resp.read() 26         mainSoup = BeautifulSoup(html_doc) 27         for s in mainSoup.findAll('tr', attrs={'bgcolor':'#7bb5de'}): 28             if 'style' not in str(s): 29                 continue
30             for d in s.findAll('td'): 31                 print d.getText(), 32                 f.write("%s " % d.getText().encode('utf-8')) #f.write("%s " % d.getText())==> UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-2: ordinal not in range(128) 33             f.write("%s\n" % d.getText().encode('utf-8')) 34             print
35  f.close() 36  conn.close() 37 
38 
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


免责声明!

本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系本站邮箱yoyou2525@163.com删除。



 
粤ICP备18138465号  © 2018-2025 CODEPRJ.COM