使用Python處理CSV文件的一些代碼示例


筆記:使用Python處理CSV文件的一些代碼示例,來自於《Python數據分析基礎》一書,有刪改

# Read and write a CSV file using only basic Python (no csv module)
# 20181110 wangml

#!/usr/bin/env python3

# Raw strings keep the Windows backslashes literal. In a normal literal,
# sequences such as '\w' or '\D' are invalid escapes and trigger a
# SyntaxWarning on Python 3.12+.
input_file = r'D:\wangm\Documents\learning\code\python\supplier_data.csv'
output_file = r'D:\wangm\Documents\learning\code\python\supplier_data_out.csv'

# Open input_file for reading and output_file for writing; mode 'w' creates
# the output file when it does not exist yet.
with open(input_file, 'r', newline='') as filereader, \
        open(output_file, 'w', newline='') as filewriter:
    # The first line of the file is the header row.
    header_list = filereader.readline().strip().split(',')
    print(header_list)
    filewriter.write(','.join(header_list) + '\n')
    # Every remaining line is split naively on ','; quoted fields that
    # contain commas are NOT handled here (that is what the csv module is for).
    for row in filereader:
        row_list = row.strip().split(',')
        print(row_list)
        filewriter.write(','.join(row_list) + '\n')
# Read and write a CSV file with the csv module
# 20181112 wangml
# csv_pandas_1
#!/usr/bin/env python3
# Import the csv library
import csv
# Raw strings: the backslashes in a Windows path must not be read as escapes.
input_file = r'D:\wangm\Documents\learning\code\python\supplier_data.csv'
output_file = r'D:\wangm\Documents\learning\code\python\supplier_data_out.csv'
with open(input_file, 'r', newline='') as csv_in_file, \
        open(output_file, 'w', newline='') as csv_out_file:
    # csv.reader()/csv.writer() wrap the two file objects.
    # delimiter sets the field separator; it defaults to ','.
    filereader = csv.reader(csv_in_file, delimiter=',')
    filewriter = csv.writer(csv_out_file, delimiter=',')
    header = next(filereader)
    filewriter.writerow(header)
    # A csv.reader can only be consumed once, so printing and filtering share
    # a single pass: a second `for` over the same reader would see no rows.
    for row_list in filereader:
        print(row_list)
        name = str(row_list[0]).strip()
        # Drop the leading '$' and the thousands separators so float() can parse it.
        cost = str(row_list[3]).strip('$').replace(',', '')
        # Keep rows whose name is 'z' or whose cost is greater than 600.
        if name == 'z' or float(cost) > 600.0:
            filewriter.writerow(row_list)
# # csv_pandas_1
#!/usr/bin/env python3
import pandas as pd
# Raw strings: the backslashes in a Windows path must not be read as escapes.
input_file = r'D:\wangm\Documents\learning\code\python\supplier_data.csv'
output_file = r'D:\wangm\Documents\learning\code\python\supplier_data_out.csv'
# pandas.read_csv() reads the CSV file into a DataFrame.
data_frame = pd.read_csv(input_file)
# Parse Cost into a separate numeric Series that is used only for the
# comparison. The Cost column itself is left untouched, so the '$' prefix and
# thousands separators are still present in the rows written to the output.
cost = (data_frame['Cost'].astype(str)
        .str.strip('$')
        .str.replace(',', '', regex=False)
        .astype(float))
# na=False: a missing Name counts as "no match" instead of yielding NaN,
# which boolean indexing would reject.
name_has_z = data_frame['Name'].str.contains('Z', na=False)
# Keep rows whose Name contains 'Z' or whose Cost is greater than 600.
data_frame_value_meets_condition = data_frame.loc[name_has_z | (cost > 600.0), :]
# Write the selected rows to the output file.
data_frame_value_meets_condition.to_csv(output_file, index=False)
# 20181113
# csv_pandas_2
#!/usr/bin/env python3
# Import the csv library
import csv
# Raw strings: the backslashes in a Windows path must not be read as escapes.
input_file = r'D:\wangm\Documents\learning\code\python\supplier_data.csv'
output_file = r'D:\wangm\Documents\learning\code\python\supplier_data_out.csv'
important_dates = ['1/1/2018', '2/1/2018']
with open(input_file, 'r', newline='') as csv_in_file, \
        open(output_file, 'w', newline='') as csv_out_file:
    filereader = csv.reader(csv_in_file)
    filewriter = csv.writer(csv_out_file)
    # Copy the header row unchanged.
    filewriter.writerow(next(filereader))
    for row_list in filereader:
        # Column 4 is compared as plain text against important_dates,
        # so the date format must match exactly.
        if row_list[4] in important_dates:
            filewriter.writerow(row_list)
# # csv_pandas_2
#!/usr/bin/env python3
import pandas as pd
# Raw strings: the backslashes in a Windows path must not be read as escapes.
input_file = r'D:\wangm\Documents\learning\code\python\supplier_data.csv'
output_file = r'D:\wangm\Documents\learning\code\python\supplier_data_out.csv'
# pandas.read_csv() reads the CSV file into a DataFrame.
data_frame = pd.read_csv(input_file)
important_dates = ['1/1/2018', '2/1/2018']
# Keep the rows whose Date value is one of important_dates.
is_important = data_frame['Date'].isin(important_dates)
data_frame_value_set = data_frame.loc[is_important, :]
data_frame_value_set.to_csv(output_file, index=False)
# 20181113
# csv_pandas_3
#!/usr/bin/env python3
# Import the csv and regular-expression libraries
import csv
import re
# Raw strings: the backslashes in a Windows path must not be read as escapes.
input_file = r'D:\wangm\Documents\learning\code\python\supplier_data.csv'
output_file = r'D:\wangm\Documents\learning\code\python\supplier_data_out.csv'
# re.compile() builds a reusable pattern object.
# (?P<my_pattern_group>...) is a named group: the matched text is captured
# under the name my_pattern_group.
# The pattern matches strings that start with '001-', followed by any characters.
# re.I makes the match case-INsensitive.
pattern = re.compile(r'(?P<my_pattern_group>^001-.*)', re.I)
with open(input_file, 'r', newline='') as csv_in_file, \
        open(output_file, 'w', newline='') as csv_out_file:
    filereader = csv.reader(csv_in_file)
    filewriter = csv.writer(csv_out_file)
    # Copy the header row unchanged.
    filewriter.writerow(next(filereader))
    for row_list in filereader:
        # Column 1 is the value tested against the pattern.
        if pattern.search(row_list[1]):
            filewriter.writerow(row_list)
# 20181113
# csv_pandas_3
#!/usr/bin/env python3
import pandas as pd
# Raw strings: the backslashes in a Windows path must not be read as escapes.
input_file = r'D:\wangm\Documents\learning\code\python\supplier_data.csv'
output_file = r'D:\wangm\Documents\learning\code\python\supplier_data_out.csv'
# pandas.read_csv() reads the CSV file into a DataFrame.
data_frame = pd.read_csv(input_file)
# Keep the rows whose ID starts with '001-'.
# na=False: a missing ID counts as "no match" instead of yielding NaN,
# which boolean indexing would reject.
id_matches = data_frame['ID'].str.startswith("001-", na=False)
data_frame_value_matches_pattern = data_frame.loc[id_matches, :]
data_frame_value_matches_pattern.to_csv(output_file, index=False)
# Select specific columns from a CSV file

# 20181113
# csv_pandas_4
# Select columns by their index values.
# When only the column names are known, the indexes can be derived from the
# header row first: check each header item against the known names and record
# the index of every item that matches. This script uses fixed indexes.
#!/usr/bin/env python3
import csv
# Raw strings: the backslashes in a Windows path must not be read as escapes.
input_file = r'D:\wangm\Documents\learning\code\python\supplier_data.csv'
output_file = r'D:\wangm\Documents\learning\code\python\supplier_data_out.csv'
my_columns = [0, 3]
with open(input_file, 'r', newline='') as csv_in_file, \
        open(output_file, 'w', newline='') as csv_out_file:
    filereader = csv.reader(csv_in_file)
    filewriter = csv.writer(csv_out_file)
    # The header row is not treated specially: it is projected like any other row.
    for row_list in filereader:
        # One output row: the values at the selected column indexes, in order.
        filewriter.writerow([row_list[index_value] for index_value in my_columns])
# Select specific columns from a CSV file

# 20181113
# csv_pandas_4
# Select columns by their index values.
# When only the column names are known, there is no need to process the header
# row by hand as in basic Python: pandas can select columns by name directly.
#!/usr/bin/env python3
import pandas as pd
# Raw strings: the backslashes in a Windows path must not be read as escapes.
input_file = r'D:\wangm\Documents\learning\code\python\supplier_data.csv'
output_file = r'D:\wangm\Documents\learning\code\python\supplier_data_out.csv'
# pandas.read_csv() reads the CSV file into a DataFrame.
data_frame = pd.read_csv(input_file)
# iloc[rows, columns] selects by integer position: all rows, columns 0 and 3.
data_frame_value_column_by_value = data_frame.iloc[:, [0, 3]]
# Label-based equivalent (loc, not iloc, accepts column names):
# data_frame_value_column_by_value = data_frame.loc[:, ['Name', 'Cost']]
data_frame_value_column_by_value.to_csv(output_file, index=False)

# Adding a header row to a CSV file: with basic Python the header row can be
# written through the csv writer's writerow() function,
# while pandas offers a simpler way:
# title = ['One', 'Two', ...]
# data_frame = pd.read_csv(input_file, header=None, names=title)
# Read multiple CSV files and print how many were found
#!/usr/bin/env python3
import csv
import glob
import os

# Raw string: the backslashes in a Windows path must not be read as escapes.
input_path = r'D:\wangm\Documents\learning\code\python'
file_counter = 0
# glob expands the '*.csv' wildcard inside input_path.
for input_file in glob.glob(os.path.join(input_path, '*.csv')):
    file_counter += 1
    # Per-file processing would go here, e.g. open input_file and iterate
    # over it with csv.reader().
print(file_counter)
# 20181114
# Concatenate multiple CSV files
#!/usr/bin/env python3
import pandas as pd
import os
import glob
# Raw strings: the backslashes in a Windows path must not be read as escapes.
input_path = r'D:\wangm\Documents\learning\code\python'
output_file = r'D:\wangm\Documents\learning\code\python\supplier_data_out.csv'
#all_files = glob.glob(os.path.join(input_path, 'supplier_data_副本*'))
# Author's note: the line above failed with "OSError: Initializing from file
# failed" because the file names contained Chinese characters; renaming the
# files and using the pattern below worked.
all_files = glob.glob(os.path.join(input_path, 'supplier_data_copy*'))
# One DataFrame per matching file.
all_data_frame = [pd.read_csv(file, index_col=None) for file in all_files]
# pandas.concat() stacks the frames vertically with axis=0 (use axis=1 to join
# them side by side). It raises ValueError when no file matched the pattern.
data_frame_concat = pd.concat(all_data_frame, axis=0, ignore_index=True)
data_frame_concat.to_csv(output_file, index=False)
# Compute the sum and the mean of one column for each of several CSV files.
# In basic Python this means reading every file, pulling the values out one by
# one and doing the arithmetic by hand; this script uses pandas instead.
#!/usr/bin/env python3
import pandas as pd
import os
import glob
# Raw strings: the backslashes in a Windows path must not be read as escapes.
input_path = r'D:\wangm\Documents\learning\code\python'
output_file = r'D:\wangm\Documents\learning\code\python\supplier_data_out.csv'
all_files = glob.glob(os.path.join(input_path, 'supplier_data_copy*'))
all_data_frame = []
for file in all_files:
    data_frame = pd.read_csv(file, index_col=None)
    # Parse the Cost column once: drop the '$' prefix and the thousands
    # separators, then convert to float.
    cost = (data_frame['Cost'].astype(str)
            .str.strip('$')
            .str.replace(',', '', regex=False)
            .astype(float))
    # Sum
    total_cost = cost.sum()
    # Mean
    average_cost = cost.mean()
    # One summary row per input file.
    data = {'file_name': [os.path.basename(file)],
            'total_cost': [total_cost],
            'average_cost': [average_cost]}
    all_data_frame.append(pd.DataFrame(data, columns=['file_name', 'total_cost', 'average_cost']))
# pandas.concat() raises ValueError when no file matched the pattern.
data_frames_concat = pd.concat(all_data_frame, axis=0, ignore_index=True)
data_frames_concat.to_csv(output_file, index=False)

代碼示例中使用的CSV文件:

上述代碼分別使用CSV庫、pandas庫來對CSV文件進行相同的操作

上述代碼運行在Python 3.6版本下,在Win10、Spyder中

有關Python的csv庫的詳細介紹(Python 3 版本文檔):https://docs.python.org/3/library/csv.html


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM