代碼可直接復制到python文件中進行運行
# 1. 文件內創建函數 # 內建函數和方法 # open() 打開文件 # read() 輸入 # readline() 輸入一行 # seek() 文件移動 # write() 輸出 # close() 關閉文件 # 寫入文件,執行完成后生成txt文件 file1 = open('name.txt', 'w') file1.write("20200202") file1.close() # 讀取文件 file2 = open('name.txt') str = file2.read() print(str) file2.close() # 編輯文件 file3 = open('name.txt', 'a') # 字符中帶\n輸入進行換行 file3.write("\n11111") file3.close() # 讀取一行 file4 = open('name.txt') print(file4.readline()) file4.close() # 逐行讀取 file5 = open('name.txt') for str_1 in file5.readlines(): print(str_1) file5.close() # 操作完成之后鼠標指針行首 file6 = open('name.txt') print(file6.readline()) # 回到行首 print(file6.seek(0)) file6.close() # 2.python異常的檢測和處理 try: a = 1 / 0 except Exception as e: print('捕獲到的異常是 %s' % e) finally: print('最終都會執行的語句') # 3.python的 可變參數 def howLong(first, *other): print(first) print(other) howLong('123', '1222', '1111') # 4.函數的迭代器和生成器 list1 = {1, 2, 3} it = iter(list1) # 迭代器next() print(next(it)) print(next(it)) print(next(it)) def frange(start, stop, step): x = start while x < stop: # 生成器關鍵字 yield yield x x += step for i in frange(10, 12, 0.5): print(i) # 5.Lambda表達式:匿名函數 add = lambda x, y: x + y print(add(2, 4)) # 6.python的內建函數 a = [1, 2, 34, 5, 6] # filter():夠快a中大於2的數 print(list(filter(lambda x: x > 2, a))) # map():依次a中的數加一 print(list(map(lambda x: x + 1, a))) # 多個列表處理:a,b中第一個元素相加 b = [3, 4, 5, 9] print(list(map(lambda x, y: x + y, a, b))) # reduce使用需要引入:完成數字累加 from functools import reduce print(reduce(lambda x, y: x + y, [1, 2, 3], 4)) # zip進行矩陣轉換 dicta = {'aa': 'a', 'bb': 'b', 'cc': 'c'} dictc = zip(dicta.values(), dicta.keys()) print(list(dictc)) # 7. python 的閉包:嵌套函數 def sum(a): def add(b): return a + b return add num27 = sum(2) print(num27(4)) # 8.python多線程 import threading from threading import current_thread class Mythread(threading.Thread): def run(self): print(current_thread().getName(), 'start') print('run') print(current_thread().getName(), 'start') t1 = Mythread() t1.start() t1.join() # 線程同步 print(current_thread().getName(), 'end') # 9.python正則表達式re # . 匹配任意單個字符 # ^ 以什么字符做開頭 # $ 以什么字符做結尾(從后向前進行匹配) # * 字符出現0~n次 # + 前面字符出現1~N次 # ? 前面字符出現0次或1次 # {m} 前面字符出現m的次 # {m,n} 前面字符出現m~n次 # [] 中括號中任意一個字符匹配成功即可 # | 字符選擇左邊或者右邊 # \d 匹配內容為數字 # \D 匹配非數字 # \s 匹配字符串 # () 進行分組 import re p = re.compile('.{3}') # 任意字符出現三次 print(p.match('d')) p1 = re.compile('jpg$') # 查找以jpg結尾的字符 print(p1.match('d')) p2 = re.compile('ca*') # 查找以jpg結尾的字符 print(p2.match('cat')) p3 = re.compile('a{4}') # 查找a出現4次 print(p3.match('caaaat')) p4 = re.compile('c[bcd]t') # 出現bcd中任意一個 print(p4.match('cat')) # 分組 p5 = re.compile(r'(\d+)-(\d+)-(\d+)') print(p5.match('2019-02-02')) # 匹配日期 print(p5.match('2019-02-02').group(1)) # 匹配年份 year, month, day = p5.match('2019-02-02').groups() # 匹配年份 print(year, month, day) # match是完全匹配進行分組,search是進行字符匹配搜索 print(p5.match('aaa2019-02-02')) print(p5.search('aaa2019-02-02')) # sub匹配替換 phone = '123-456-789 # 這是電話號碼' print(re.sub(r'#.*$', '', phone)) # 將警號后面替換為空 print(re.sub(r'\D', '', phone)) # 非數字替換為空 # 10. python日期函數函數庫 # import time print(time.time()) # 1970年到現在的時間 print(time.localtime()) print(time.strftime('%Y-%m-%d %H:%M:%S')) import datetime # datetime用作時間的修改 print(datetime.datetime.now()) new_time = datetime.timedelta(minutes=10) print(datetime.datetime.now() + new_time) # 十分鍾之后的時間 one_day = datetime.datetime(2019, 9, 9) new_day = datetime.timedelta(days=10) print(one_day + new_day) # 11.網頁數據采集與urllib from urllib import request url = 'http://www.baidu.com' response = request.urlopen(url, timeout=1) # print(response.read().decode('utf-8')) # 12.GET和POST請求 from urllib import parse from urllib import request data = bytes(parse.urlencode({'world': 'hello'}), encoding='utf8') # print(data) response = request.urlopen('http://httpbin.org/post', data=data) # print(response.read().decode('utf-8')) import urllib import socket try: response2 = request.urlopen('http://httpbin.org/get', timeout=1) # print(response2.read()) except urllib.error.URLError as e: if isinstance(e.reason, socket.timeout): print("time out") # 13.python的requests庫的使用 # get請求 import requests url2131 = 'http://httpbin.org/get' data2131 = {'key': 'value', 'abc': 'xyz'} response2131 = requests.get(url2131, data2131) # print(response2131.text) # post請求 url2132 = 'http://httpbin.org/post' data2132 = {'key': 'value', 'abc': 'xyz'} response2132 = requests.post(url2132, data2132) # print(response2132.json()) # 14.python的正則表達式爬取鏈接 # import requests # import re content = requests.get('http://www.cnu.cc/discoveryPage/hot-人像').text # print(content) patter2141 = re.compile(r'<a href="(.*?)".*?title">(.*?)</div>', re.S) results2141 = re.findall(patter2141, content) # print('ssssss', results2141) for result2141 in results2141: url2141, name2141 = result2141 # print(url2141, re.sub('\s', '', name2141)) # 15.爬蟲使用beautiful Soup的安裝使用 # pip3 install bs4 from bs4 import BeautifulSoup soup = BeautifulSoup(content, 'lxml') # print(soup.prettify()) # 格式化的處理 # print(soup.title) # 獲取title # print(soup.title.string) # 獲取title # print(soup.p) # 獲取p標簽 # print(soup.a) # 獲取a標簽 # print(soup.find(id='link3')) # 獲取id=link3的標簽 # 查找所有a標簽的鏈接 # for link in soup.find_all('a'): # print(link.get('href')) # print(soup.get_text()) # 獲取文檔中所有文本內容 # 16.爬蟲網頁標題 # from bs4 import BeautifulSoup # import requests headers = { "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", "Accept-Language": "zh-CN,zh;q=0.8", "Connection": "close", "Cookie": "_gauges_unique_hour=1; _gauges_unique_day=1; _gauges_unique_month=1; _gauges_unique_year=1; _gauges_unique=1", "Referer": "http://www.infoq.com", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER" } url2161 = 'https://www.infoq.com/news/' # 取得網頁完整內容 def craw(url2162): response2162 = requests.get(url2162, headers=headers) print(response2162.text) # craw(url2161) # 取得新聞標題 def craw2(url2163): response2163 = requests.get(url2163, headers=headers) soup2163 = BeautifulSoup(response2163.text, 'lxml') for title_href in soup2163.find_all('div', class_='items__content'): print([title.get('title') for title in title_href.find_all('a') if title.get('title')]) # craw2(url2161) # # 翻頁 # for i in range(15, 46, 15): # url2164 = 'http://www.infoq.com/news/' + str(i) # # print(url) # craw2(url2164) # 17.python爬蟲爬取圖片下載 from bs4 import BeautifulSoup import requests import os import shutil headers = { "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", "Accept-Language": "zh-CN,zh;q=0.8", "Connection": "close", "Cookie": "_gauges_unique_hour=1; _gauges_unique_day=1; _gauges_unique_month=1; _gauges_unique_year=1; _gauges_unique=1", "Referer": "http://www.infoq.com", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER" } url = 'http://www.infoq.com/presentations' # 下載圖片 # Requests 庫封裝復雜的接口,提供更人性化的 HTTP 客戶端,但不直接提供下載文件的函數。 # 需要通過為請求設置特殊參數 stream 來實現。當 stream 設為 True 時, # 上述請求只下載HTTP響應頭,並保持連接處於打開狀態, # 直到訪問 Response.content 屬性時才開始下載響應主體內容 def download_jpg(image_url, image_localpath): response = requests.get(image_url, stream=True) if response.status_code == 200: with open(image_localpath, 'wb') as f: response.raw.deconde_content = True shutil.copyfileobj(response.raw, f) # 取得演講圖片 def craw3(url): response = requests.get(url, headers=headers) soup = BeautifulSoup(response.text, 'lxml') for pic_href in soup.find_all('div', class_='items__content'): for pic in pic_href.find_all('img'): imgurl = pic.get('src') dir = os.path.abspath('.') filename = os.path.basename(imgurl) imgpath = os.path.join(dir, filename) print('開始下載 %s' % imgurl) download_jpg(imgurl, imgpath) # craw3(url) # 翻頁 j = 0 for i in range(12, 37, 12): url = 'http://www.infoq.com/presentations' + str(i) j += 1 print('第 %d 頁' % j) craw3(url)