一.數據分析的步驟:
1.查看數據並提出問題
2.數據清洗
3.代碼編寫,提取出結果數據,並分析是否有異常數據,修改代碼
4.根據數據選擇合適的圖表進行展示
5.根據圖表小組討論交流獲得最終的結果
二.環境與原始數據準備
安裝Anaconda2版本,然後執行 conda upgrade --all 將所有軟件包更新到最新版本
下載first.zip文件,解壓
裡面有3個csv文件,分別是enrollments.csv、daily-engagement.csv、project-submissions.csv,以及一個ipython的notebook
啟動cmd,切換到解壓之後的文件夾,輸入 jupyter notebook 啟動ipython筆記本
三.分析數據
1.從csv加載數據
import unicodecsv


def readcsv(filename):
    """Read the CSV file *filename* and return its rows as a list.

    Every row is whatever ``unicodecsv.DictReader`` yields for it, i.e. one
    dict-like record per data line.
    """
    # Binary mode: the reader is handed the raw byte stream of the file.
    with open(filename, 'rb') as csv_file:
        rows = list(unicodecsv.DictReader(csv_file))
    return rows
# Load the three CSV tables into the variables below, then inspect the
# first row of each table.
daily_engagement = readcsv('daily-engagement.csv')
project_submissions = readcsv('project-submissions.csv')
enrollments = readcsv('enrollments.csv')

for table in (daily_engagement, project_submissions, enrollments):
    print(table[0])
2.修正數據類型
from datetime import datetime as dt


def parse_date(date):
    """Turn a ``YYYY-MM-DD`` string into a datetime.

    An empty string means "no date" and yields None.
    """
    if date == '':
        return None
    return dt.strptime(date, '%Y-%m-%d')


def parse_maybe_int(i):
    """Turn a string into an int; an empty string yields None."""
    return None if i == '' else int(i)
# Fix the data types in the enrollments table (cancel date, join date,
# days until cancellation, is-canceled flag, is-Udacity-test-account flag).
for enrollment in enrollments:
    for date_field in ('cancel_date', 'join_date'):
        enrollment[date_field] = parse_date(enrollment[date_field])
    enrollment['days_to_cancel'] = parse_maybe_int(enrollment['days_to_cancel'])
    # The flags arrive as the strings 'True' / 'False'.
    for flag_field in ('is_canceled', 'is_udacity'):
        enrollment[flag_field] = enrollment[flag_field] == 'True'
enrollments[0]

# Fix the data types in the engagement table (date, courses visited,
# lessons completed, projects completed, total minutes spent).
for engagement_record in daily_engagement:
    engagement_record['utc_date'] = parse_date(engagement_record['utc_date'])
    # Counts are parsed through float() first, then truncated to int.
    for count_field in ('num_courses_visited',
                        'lessons_completed',
                        'projects_completed'):
        engagement_record[count_field] = int(float(engagement_record[count_field]))
    engagement_record['total_minutes_visited'] = float(
        engagement_record['total_minutes_visited'])
daily_engagement[0]

# Fix the data types in the submissions table (creation date, completion date).
for submission in project_submissions:
    for date_field in ('creation_date', 'completion_date'):
        submission[date_field] = parse_date(submission[date_field])
project_submissions[0]
3.修改數據中的格式問題
# Rename the "acct" column of the daily_engagement table to "account_key".
for engagement_record in daily_engagement:
    # pop() removes the old key and hands back its value in one step.
    engagement_record['account_key'] = engagement_record.pop('acct')
4.探索數據
# Count the total number of rows in each table and the number of unique
# students (distinct account keys).
def unique_student_data(data):
    """Return the set of distinct ``account_key`` values found in *data*."""
    return {row['account_key'] for row in data}


len(enrollments)
unique_enrolled_students = unique_student_data(enrollments)
len(unique_enrolled_students)

len(daily_engagement)
unique_daily_engagement = unique_student_data(daily_engagement)
len(unique_daily_engagement)

len(project_submissions)
unique_project_submissions = unique_student_data(project_submissions)
len(unique_project_submissions)
5.找出問題數據
# Count the problematic data points: enrollments whose account is missing
# from the engagement table. Enrollments whose join date equals their
# cancel date are not counted.
num_problem_students = 0
for enrollment in enrollments:
    has_engagement = enrollment['account_key'] in unique_daily_engagement
    same_day_cancel = enrollment['join_date'] == enrollment['cancel_date']
    if has_engagement or same_day_cancel:
        continue
    num_problem_students += 1
    print(enrollment)
print(num_problem_students)
6.追蹤剩餘的問題(移除數據集的測試賬號)
# Collect the account keys of all Udacity test accounts into a set.
udacity_test_account = {
    enrollment['account_key']
    for enrollment in enrollments
    if enrollment['is_udacity']
}
len(udacity_test_account)


# Drop every Udacity test account, matching on account_key.
def remove_udacity_account(data):
    """Return the rows of *data* whose account_key is not a test account."""
    return [
        row for row in data
        if row['account_key'] not in udacity_test_account
    ]


# Remove all Udacity test accounts from the three tables.
non_udacity_enrollments = remove_udacity_account(enrollments)
non_udacity_engagement = remove_udacity_account(daily_engagement)
non_udacity_submissions = remove_udacity_account(project_submissions)
# Build a dict named paid_students holding every student who either has not
# canceled or whose days_to_cancel is greater than 7.
paid_students = {}
for enrollment in non_udacity_enrollments:
    # Skip enrollments that were canceled without exceeding the 7-day mark.
    if enrollment['is_canceled'] and not (enrollment['days_to_cancel'] > 7):
        continue

    # Remember the student's key and join date.
    account_key = enrollment['account_key']
    enrollment_date = enrollment['join_date']

    # Store the student if not seen yet; otherwise keep the most recent
    # join date for that account.
    if (account_key not in paid_students
            or enrollment_date > paid_students[account_key]):
        paid_students[account_key] = enrollment_date

len(paid_students)  # number of paid students recorded
7.獲取第一周就已經付費報名的學生
# Compute the time difference in whole days and check it is within one week.
def within_one_week(join_date, engagement_date):
    """Return True if *engagement_date* falls in the first week after joining.

    Both arguments are datetime values. The result is True when the
    engagement happened on or after *join_date* and fewer than 7 whole days
    later; engagements before the join date are rejected.
    """
    # Measure from the join date to the engagement, using only the two
    # parameters (the previous version read an unrelated global and
    # subtracted in the wrong direction).
    time_delta = engagement_date - join_date
    return 0 <= time_delta.days < 7
# Keep only the records that belong to paid students.
def remove_free_trial_cancels(data):
    """Return the rows of *data* whose account_key is in paid_students."""
    return [row for row in data if row['account_key'] in paid_students]


paid_enrollment = remove_free_trial_cancels(non_udacity_enrollments)
paid_engagement = remove_free_trial_cancels(non_udacity_engagement)
paid_project_missions = remove_free_trial_cancels(non_udacity_submissions)

for paid_table in (paid_enrollment, paid_engagement, paid_project_missions):
    print(len(paid_table))
# Build a list of engagement records restricted to paid students and to the
# first 7 days after each student joined, then show how many rows qualify.
paid_engagement_in_first_week = [
    engagement_record
    for engagement_record in paid_engagement
    if within_one_week(paid_students[engagement_record['account_key']],
                       engagement_record['utc_date'])
]
len(paid_engagement_in_first_week)
from collections import defaultdict
import numpy as np


# Group records by one of their fields, e.g. engagement records by student.
def group_data(data, key_name):
    """Group the records of *data* by the value stored under *key_name*.

    Returns a ``defaultdict(list)`` mapping each distinct value to the list
    of records carrying it, in their original order.
    """
    buckets = defaultdict(list)
    for record in data:
        buckets[record[key_name]].append(record)
    return buckets


# Total one numeric field per group, e.g. minutes spent per student.
def sum_grouped_items(grouped_data, field_name):
    """Return a dict mapping each group key to the sum of *field_name*."""
    return {
        key: sum(record[field_name] for record in records)
        for key, records in grouped_data.items()
    }


# Summarise a collection of numbers.
def describe_data(data):
    """Print the mean, standard deviation, minimum and maximum of *data*."""
    summaries = (
        ('Mean:', np.mean),
        ('Standard deviation:', np.std),
        ('Minimum:', np.min),
        ('Maximum:', np.max),
    )
    for label, statistic in summaries:
        print('%s %s' % (label, statistic(data)))
8.獲取學習時間最長的學生和時間
# Group the first-week engagement records per student, then total the
# minutes each student spent in the classroom.
engagement_by_account = group_data(paid_engagement_in_first_week, 'account_key')
total_minutes_by_account = sum_grouped_items(engagement_by_account,
                                             'total_minutes_visited')

# Find the student with the largest first-week total.
student_with_max_minutes = None
max_minutes = 0
for student, total_nums in total_minutes_by_account.items():
    if total_nums > max_minutes:
        max_minutes = total_nums
        student_with_max_minutes = student
print(max_minutes)

# Print that student's first-week records. The comparison must use
# student_with_max_minutes; the loop variable `student` merely holds
# whichever account was iterated last.
for engagement_record in paid_engagement_in_first_week:
    if engagement_record['account_key'] == student_with_max_minutes:
        print(engagement_record)
9.找出第一周的訪問數
# Find the mean, standard deviation, minimum and maximum of the number of
# days students visited the classroom during their first week.
for engagement_record in paid_engagement:
    # A day counts as visited when at least one course was visited.
    engagement_record['has_visited'] = (
        1 if engagement_record['num_courses_visited'] > 0 else 0)

# Rebuild the per-student grouping so this step is self-contained. The
# first-week list holds the same record objects that were flagged above.
engagement_by_account = group_data(paid_engagement_in_first_week, 'account_key')
days_visited_by_account = sum_grouped_items(engagement_by_account, 'has_visited')
describe_data(days_visited_by_account.values())
10.區分項目通過的學生
# Build two lists of first-week engagement records for paid students: one
# for students who passed the project, one for students who did not.
subway_project_lesson_keys = ['746169184', '3176718735']

# Account keys of the students who passed the project.
pass_subway_project = set()
for submission in paid_project_missions:
    project = submission['lesson_key']
    rating = submission['assigned_rating']
    # A rating of PASSED or DISTINCTION counts as passing.
    if (project in subway_project_lesson_keys
            and rating in ('PASSED', 'DISTINCTION')):
        pass_subway_project.add(submission['account_key'])

passing_engagement = []      # records of students who passed
non_passing_engagement = []  # records of students who did not pass
for engagement_record in paid_engagement_in_first_week:
    passed = engagement_record['account_key'] in pass_subway_project
    bucket = passing_engagement if passed else non_passing_engagement
    bucket.append(engagement_record)

print(len(passing_engagement))
print(len(non_passing_engagement))
11.對比兩組學生的數據
# Compute the metrics of interest and compare the students who passed the
# project with those who did not, using the metrics from earlier steps
# (minutes in the classroom, lessons completed, days visited).
passing_engagement_by_account = group_data(passing_engagement, 'account_key')
non_passing_engagement_by_account = group_data(non_passing_engagement,
                                               'account_key')

non_passing_minute = sum_grouped_items(non_passing_engagement_by_account,
                                       'total_minutes_visited')
passing_minute = sum_grouped_items(passing_engagement_by_account,
                                   'total_minutes_visited')
non_passing_lessons = sum_grouped_items(non_passing_engagement_by_account,
                                        'lessons_completed')
passing_lessons = sum_grouped_items(passing_engagement_by_account,
                                    'lessons_completed')
non_passing_visited = sum_grouped_items(non_passing_engagement_by_account,
                                        'has_visited')
passing_visited = sum_grouped_items(passing_engagement_by_account,
                                    'has_visited')

# Report each metric, non-passing group first.
for heading, totals in (
        ('non-passing students', non_passing_minute),
        ('passing students', passing_minute),
        ('non-passing lessons', non_passing_lessons),
        ('passing lessons', passing_lessons),
        ('non-passing visited', non_passing_visited),
        ('passing visited', passing_visited)):
    print(heading)
    describe_data(totals.values())
12.繪制直方圖
%pylab inline import matplotlib.pyplot as plt import numpy as np def describe_data(data): print 'Mean:', np.mean(data) print 'Standard deviation:', np.std(data) print 'Minimum:', np.min(data) print 'Maximum:', np.max(data) plt.hist(data) describe_data(passing_minute.values()) describe_data(non_passing_minute.values())
13.改進圖表並分析
## 至少改進一幅之前的可視化圖表,嘗試導入 seaborn 庫使你的圖表看起來更美觀。 ## 加入軸標簽及表頭,並修改一個或多個 hist() 內的變量。 %pylab inline import seaborn as sns sns.set(color_codes=True) plt.hist(non_passing_minute.values(),bins=8) plt.xlabel('mean of minut') plt.title('Distribution of classroom visits in the first week ' + 'for students who do not pass the subway project') plt.hist(passing_minute.values(),bins=8) plt.xlabel('mean of minut') plt.title('Distribution of classroom visits in the first week ' + 'for students who do not pass the subway project')