pandas模塊(數據分析)------dataframe


 DataFrame

DataFrame是一個表格型的數據結構,含有一組有序的列,是一個二維結構。

DataFrame可以被看做是由Series組成的字典,並且共用一個索引。

一、生成方式

import numpy as np
import pandas as pd
a=pd.DataFrame({'one':pd.Series([1,2,3],index=['a','b','c']), 'two':pd.Series([1,2,3,4],index=['b','a','c','d'])})
a

可以看出 有one和two兩個Series組成,並且共用一組索引a,b,c,d

# 字典方式創建

b=pd.DataFrame({"today":[12,43,23,123],"tomory":[23,45,34,23]})
b

# 自定義索引

c=pd.DataFrame({"today":[12,43,23,123],"tomory":[23,45,34,23]},index=list("abcd"))
c

二、csv文件讀取與寫入

df = pd.read_csv("d:/601318.csv")
df

2470 rows × 8 columns

x=open("d:/601318.csv")
df=pd.read_csv(x)
df

2470 rows × 8 columns

 1 #  保存到文件
 2 df.to_csv("d:/new.csv")
 3 
 4 
 5 # index                 獲取行索引
 6 df.index
 7 
 8 RangeIndex(start=0, stop=2470, step=1)
 9 
10 a.index
11 
12 Index(['a', 'b', 'c', 'd'], dtype='object')
13 
14 
15 # 返回列索引
16 df.columns
17 
18 Index(['id', 'date', 'open', 'close', 'high', 'low', 'volume', 'code'], dtype='object')
19 
20 
21 #  values  返回二維數組
22 df.values
23 
24 array([
25         [0, '2007/3/1', 22.074, ..., 20.22, 1977633.51, 601318],
26         [1, '2007/3/2', 20.75, ..., 20.256, 425048.32, 601318],
27         [2, '2007/3/5', 20.3, ..., 19.218, 419196.74, 601318],
28         ..., 
29         [2467, '2017/7/28', 52.2, ..., 51.8, 491294.0, 601318],
30         [2468, '2017/7/31', 51.88, ..., 51.41, 616005.0, 601318],
31         [2469, '2017/8/1', 52.2, ..., 52.2, 1147936.0, 601318]
32         ], 
33         dtype=object)
34 
35 
36 # 轉置  行和列交換
37 
38 a.T

#  describe 按列打印一些統計信息

df.describe()

#  df 的columns 和index都有name屬性

# 上面的數據中的index的name還沒有值,可以設置一個
df.index.name='indexname'
df

2470 rows × 8 columns

#獲取第一列的name
df.columns[0]
'id'


df.columns[1]
'date'


#  給列重命名,並沒有修改原數據,這是下面是返回的數據
df.rename(columns={"close":"newclose","low":"newlow"})

2470 rows × 8 columns

三、索引和切片

df[0]
    ---------------------------------------------------------------------------

    KeyError                                  Traceback (most recent call last)

    d:\program files (x86)\python35\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
       2441             try:
    -> 2442                 return self._engine.get_loc(key)
       2443             except KeyError:


    pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5280)()


    pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5126)()


    pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20523)()


    pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20477)()


    KeyError: 0


    During handling of the above exception, another exception occurred:


    KeyError                                  Traceback (most recent call last)

    <ipython-input-18-9ae93f22b889> in <module>()
    ----> 1 df[0]


    d:\program files (x86)\python35\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
       1962             return self._getitem_multilevel(key)
       1963         else:
    -> 1964             return self._getitem_column(key)
       1965 
       1966     def _getitem_column(self, key):


    d:\program files (x86)\python35\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
       1969         # get column
       1970         if self.columns.is_unique:
    -> 1971             return self._get_item_cache(key)
       1972 
       1973         # duplicate columns & possible reduce dimensionality


    d:\program files (x86)\python35\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
       1643         res = cache.get(item)
       1644         if res is None:
    -> 1645             values = self._data.get(item)
       1646             res = self._box_item_values(item, values)
       1647             cache[item] = res


    d:\program files (x86)\python35\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
       3588 
       3589             if not isnull(item):
    -> 3590                 loc = self.items.get_loc(item)
       3591             else:
       3592                 indexer = np.arange(len(self.items))[isnull(self.items)]


    d:\program files (x86)\python35\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
       2442                 return self._engine.get_loc(key)
       2443             except KeyError:
    -> 2444                 return self._engine.get_loc(self._maybe_cast_indexer(key))
       2445 
       2446         indexer = self.get_indexer([key], method=method, tolerance=tolerance)


    pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5280)()


    pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5126)()


    pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20523)()


    pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20477)()


    KeyError: 0
df["close"]
    indexname
    0       20.657
    1       20.489
    2       19.593
    3       19.977
    4       20.520
    5       20.273
    6       20.101
    7       19.739
    8       19.818
    9       19.841
    10      19.849
    11      19.960
    12      20.211
    13      19.911
    14      20.026
    15      19.938
    16      20.282
    17      20.269
    18      20.565
    19      20.927
    20      20.772
    21      21.364
    22      21.284
    23      21.099
    24      21.156
    25      21.196
    26      22.785
    27      23.319
    28      23.637
    29      23.593
             ...  
    2440    48.896
    2441    48.609
    2442    49.183
    2443    49.183
    2444    49.381
    2445    48.085
    2446    49.420
    2447    49.074
    2448    48.411
    2449    47.403
    2450    49.876
    2451    50.835
    2452    50.459
    2453    50.578
    2454    51.230
    2455    50.610
    2456    51.630
    2457    52.770
    2458    53.900
    2459    53.470
    2460    53.840
    2461    54.010
    2462    51.960
    2463    52.610
    2464    52.310
    2465    51.890
    2466    52.360
    2467    51.890
    2468    52.020
    2469    54.850
    Name: close, Length: 2470, dtype: float64

從上邊可以看出,[]里邊似乎要用來選擇列才可以(后面知道,切片也可以)

# 花式索引

df[["close","low"]]

2470 rows × 2 columns

df["close"][0]

20.656999999999996

df["close"] 先得到一個Series,然后再用標簽索引0去查找

df[["close","low"]][0]
 1     ---------------------------------------------------------------------------
 2 
 3     KeyError                                  Traceback (most recent call last)
 4 
 5     d:\program files (x86)\python35\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
 6        2441             try:
 7     -> 2442                 return self._engine.get_loc(key)
 8        2443             except KeyError:
 9 
10 
11     pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5280)()
12 
13 
14     pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5126)()
15 
16 
17     pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20523)()
18 
19 
20     pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20477)()
21 
22 
23     KeyError: 0
24 
25 
26     During handling of the above exception, another exception occurred:
27 
28 
29     KeyError                                  Traceback (most recent call last)
30 
31     <ipython-input-22-7ed9e36ec1ab> in <module>()
32     ----> 1 df[["close","low"]][0]
33 
34 
35     d:\program files (x86)\python35\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
36        1962             return self._getitem_multilevel(key)
37        1963         else:
38     -> 1964             return self._getitem_column(key)
39        1965 
40        1966     def _getitem_column(self, key):
41 
42 
43     d:\program files (x86)\python35\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
44        1969         # get column
45        1970         if self.columns.is_unique:
46     -> 1971             return self._get_item_cache(key)
47        1972 
48        1973         # duplicate columns & possible reduce dimensionality
49 
50 
51     d:\program files (x86)\python35\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
52        1643         res = cache.get(item)
53        1644         if res is None:
54     -> 1645             values = self._data.get(item)
55        1646             res = self._box_item_values(item, values)
56        1647             cache[item] = res
57 
58 
59     d:\program files (x86)\python35\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
60        3588 
61        3589             if not isnull(item):
62     -> 3590                 loc = self.items.get_loc(item)
63        3591             else:
64        3592                 indexer = np.arange(len(self.items))[isnull(self.items)]
65 
66 
67     d:\program files (x86)\python35\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
68        2442                 return self._engine.get_loc(key)
69        2443             except KeyError:
70     -> 2444                 return self._engine.get_loc(self._maybe_cast_indexer(key))
71        2445 
72        2446         indexer = self.get_indexer([key], method=method, tolerance=tolerance)
73 
74 
75     pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5280)()
76 
77 
78     pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5126)()
79 
80 
81     pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20523)()
82 
83 
84     pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20477)()
85 
86 
87     KeyError: 0

之所以報錯,是因為df[["close","low"]]得到的是一個DataFrame類型,它再加[],[]里邊只能是列

# 切片,這個時候解釋的就是行

df[0:10]

推薦使用loc和iloc索引

# 在loc里邊,逗號左邊表示行,右邊表示列

# 在這里的3:10被解釋為標簽(不是行的下標)
ddf=df.loc[3:10,["close","low"]]
ddf

#  那我現在想拿到ddf里,"low"列,第5行的數據

# ddf["low"]得到的是一個Series,其索引是整數的,所以必須使用iloc指明使用下標取值
ddf["low"].iloc[4]

19.646000000000001

布爾值索引

# 過濾某一列

df[df["close"]<20]

856 rows × 8 columns

# 過濾所有的位置

# dataframe會將所有位置上小於20的設置為nan(因為其不能確定該怎么舍棄數據,不可能因為一行中一個nan就刪除整個一行或者一列)

df[df<20]

2470 rows × 8 columns

#  將所有小於20的值改為0

# 請注意這里,會將為False的位置改為0,所以我們要寫大於20,這樣的話小於20的才是False
df[df>20].fillna(0)

2470 rows × 8 columns

#  選擇date 為2017/7/25 和2017/7/3 的值

# 這里的date是字符串類型,不是datetime類型

df[(df["date"]=="2017/7/25") | (df["date"]=="2017/7/3")]

#  這里還可以用isin方法去過濾一個范圍

df[df["date"].isin(["2017/7/25","2017/7/3"])]

df[df["high"].isin([53.050,54.150])]

修改值的時候要注意類型的問題

# 比如要將所有小於20的位置變為0

# 做法一:
df[df>20].fillna(0)

# 做法二:等號賦值
df[df<20]=0
    ---------------------------------------------------------------------------

    TypeError                                 Traceback (most recent call last)

    <ipython-input-45-ea838d192259> in <module>()
          5 
          6 # 做法二:等號賦值
    ----> 7 df[df<20]=0


    d:\program files (x86)\python35\lib\site-packages\pandas\core\frame.py in __setitem__(self, key, value)
       2326             self._setitem_array(key, value)
       2327         elif isinstance(key, DataFrame):
    -> 2328             self._setitem_frame(key, value)
       2329         else:
       2330             # set column


    d:\program files (x86)\python35\lib\site-packages\pandas\core\frame.py in _setitem_frame(self, key, value)
       2362             raise TypeError('Must pass DataFrame with boolean values only')
       2363 
    -> 2364         self._check_inplace_setting(value)
       2365         self._check_setitem_copy()
       2366         self._where(-key, value, inplace=True)


    d:\program files (x86)\python35\lib\site-packages\pandas\core\generic.py in _check_inplace_setting(self, value)
       3197                     pass
       3198 
    -> 3199                 raise TypeError('Cannot do inplace boolean setting on '
       3200                                 'mixed-types with a non np.nan value')
       3201 


    TypeError: Cannot do inplace boolean setting on mixed-types with a non np.nan value

報錯的原因是因為,date這列是字符串類型,設置為0,類型轉換失敗

# 現在通過切片,去掉date列,看能否轉換成功

df2=df.loc[:10,"open":"code"]
df2

df2[df2<20]=0
df2

可以看出,如果列里邊沒有字符串類型,是可以轉換成功的

四、數據對齊和數據缺失

df3=df + df2
df3

2470 rows × 8 columns

新的數據,列和行都要對齊,列date和id都是nan,是因為df2中沒有這兩列,這些其實跟Series的道理是一樣的

處理缺失數據的相關方法:

  • dropna() 過濾掉值為NaN的行
  • fillna() 填充缺失數據
  • isnull() 返回布爾數組,缺失值對應為True
  • notnull() 返回布爾數組,缺失值對應為False

跟Series的方法是一樣的

df3.dropna()

在這里,dropna默認的規則,只要行里有nan,就會清除掉整行,但是可以設置參數去改變

df3.dropna(how="any") ---->默認是any,只要有nan就刪除;how='all'的話,就是行里全是nan才刪除

那如果我想對列進行操作,就還需要另外一個操作,要記住默認的規則是對行的

df3.dropna(how="any",axis=0)--->axis默認等於0,表示是對行進行規則,axis=1的話,就表示對列進行規則

  • df3.dropna(how="any",axis=0)--->清除掉行里含有nan的行
  • df3.dropna(how="all",axis=0)--->清除掉行里都是nan的行
  • df3.dropna(how="any",axis=1)--->清除掉列里含有nan的列
  • df3.dropna(how="all",axis=1)--->清除掉列里都是nan的列
# 將位置是nan的地方替換為0

df3.fillna(0)

2470 rows × 8 columns

五、常用函數

mean 得出每個列的平均值

df2.mean()
    open          11.258000
    close          9.276364
    high          15.107000
    low            5.513000
    volume    388403.913636
    code      601318.000000
    dtype: float64
#  單列的平均值(Series)

df2["close"].mean()

9.2763636363636355

sum 求出每列的和
字符串的話,就是字符串的拼接

df.sum()
    id                                                  3049215
    date      2007/3/12007/3/22007/3/52007/3/62007/3/72007/3...
    open                                                63999.2
    close                                               64054.2
    high                                                65113.7
    low                                                 63035.4
    volume                                          1.18105e+09
    code                                             1485255460
    dtype: object

sort 排序
sort_index 按照索引排序(行索引和列索引)
ascending默認為True ,表示按照升序排序;False表示降序
axis為0 ,代表按行索引;1代表用列索引 - sort_index(ascending=False,axis=0)

# ascending默認為True ,表示按照升序排序;False表示降序

df.sort_index(ascending=False)

2470 rows × 8 columns

# ascending默認為True ,表示按照升序排序;False表示降序

df.sort_index(ascending=False)

2470 rows × 8 columns

sort_values 按照值排序

# 按照close列升序排序

df2.sort_values("close")

# 按照close列降序

df2.sort_values("close",ascending=False)

1 # 按照close列升序排序,如果有close值相同,再按照low列排序
2 
3 df2.sort_values(["close","low"])

# axis=1,按照行排序,在這里一定要注意,必須保證這一行的數據類型是一致的,比如df中有字符串類型,就會報錯

# df2 行里的數據類型都是一致的是沒有問題的,第一個參數是說按照行的索引號,df中,0和1的結果就不一樣
df2.sort_values(0,axis=1)

df2.sort_values(1,axis=1)

numpy的通用函數同樣適用於pandas

# 請注意類型

df.abs()
 ---------------------------------------------------------------------------

    TypeError                                 Traceback (most recent call last)

    <ipython-input-98-db394c0c0cf4> in <module>()
          1 # 請主要類型
          2 
    ----> 3 df.abs()


    d:\program files (x86)\python35\lib\site-packages\pandas\core\generic.py in abs(self)
       5661         abs: type of caller
       5662         """
    -> 5663         return np.abs(self)
       5664 
       5665     def describe(self, percentiles=None, include=None, exclude=None):


    TypeError: bad operand type for abs(): 'str'
df2.abs()

六、自定義函數

applymap(函數名),作用於DataFrame上,這個函數的應用是針對於df里的每個位置去執行

apply(函數名),作用於DataFrame上,將操作應用於整列或者整行上(整行要修改axis=1)

map作用於Series上

import numpy as np

import  pandas as pd
df=pd.read_csv("d:/601318.csv")
df

2470 rows × 8 columns

df2=df.loc[:15,"close":"code"]
df2

#df2中每個位置都是加10

df2.applymap(lambda x:x+10)

# map作用於Series

df4=df2["close"]
df4.map(lambda x:x+100)
    0     120.657
    1     120.489
    2     119.593
    3     119.977
    4     120.520
    5     120.273
    6     120.101
    7     119.739
    8     119.818
    9     119.841
    10    119.849
    11    119.960
    12    120.211
    13    119.911
    14    120.026
    15    119.938
    Name: close, dtype: float64
#apply 將操作應用到每一列上

df2.apply(lambda x:x.sum()+1)
    close         321.903
    high          328.752
    low           317.416
    volume    5166066.460
    code      9621089.000
    dtype: float64
#apply 將操作應用到每一行上

df2.apply(lambda x:x.sum()+1,axis=1)
pandas之dataframe(下)
自定義函數
applymap(函數名),作用於DataFrame上,這個函數的應用是針對於df里的每個位置去執行

apply(函數名),作用於DataFrame上,將操作應用於整列或者整行上(整行要修改axis=1)

map作用於Series上

import numpy as np

import  pandas as pd
df=pd.read_csv("d:/601318.csv")
df
id    date    open    close    high    low    volume    code
0    0    2007/3/1    22.074    20.657    22.503    20.220    1977633.51    601318
1    1    2007/3/2    20.750    20.489    20.944    20.256    425048.32    601318
2    2    2007/3/5    20.300    19.593    20.384    19.218    419196.74    601318
3    3    2007/3/6    19.426    19.977    20.308    19.315    297727.88    601318
4    4    2007/3/7    19.995    20.520    20.706    19.827    287463.78    601318
5    5    2007/3/8    20.353    20.273    20.454    20.167    130983.83    601318
6    6    2007/3/9    20.264    20.101    20.353    19.735    160887.79    601318
7    7    2007/3/12    19.999    19.739    19.999    19.646    145353.06    601318
8    8    2007/3/13    19.783    19.818    19.982    19.699    102319.68    601318
9    9    2007/3/14    19.558    19.841    19.911    19.333    173306.56    601318
10    10    2007/3/15    20.097    19.849    20.525    19.779    152521.90    601318
11    11    2007/3/16    19.863    19.960    20.286    19.602    227547.24    601318
12    12    2007/3/20    20.662    20.211    20.715    20.088    222026.87    601318
13    13    2007/3/21    20.220    19.911    20.308    19.823    136728.32    601318
14    14    2007/3/22    20.066    20.026    20.273    19.969    167509.84    601318
15    15    2007/3/23    20.017    19.938    20.101    19.739    139810.14    601318
16    16    2007/3/26    19.955    20.282    20.397    19.946    223266.79    601318
17    17    2007/3/27    20.216    20.269    20.467    20.145    139338.19    601318
18    18    2007/3/28    20.264    20.565    20.706    20.123    258263.69    601318
19    19    2007/3/29    20.666    20.927    21.540    20.520    461986.18    601318
20    20    2007/3/30    20.732    20.772    21.134    20.626    144617.20    601318
21    21    2007/4/2    20.772    21.364    21.501    20.772    231445.03    601318
22    22    2007/4/3    21.377    21.284    21.527    21.147    132712.04    601318
23    23    2007/4/4    21.289    21.099    21.412    20.993    122454.69    601318
24    24    2007/4/5    21.103    21.156    21.191    20.838    122865.38    601318
25    25    2007/4/6    21.050    21.196    21.611    20.971    195208.52    601318
26    26    2007/4/9    21.231    22.785    22.909    21.059    462770.21    601318
27    27    2007/4/10    22.516    23.319    23.699    22.516    407823.90    601318
28    28    2007/4/11    23.346    23.637    24.361    23.222    243446.50    601318
29    29    2007/4/12    23.832    23.593    25.606    23.377    159270.43    601318
...    ...    ...    ...    ...    ...    ...    ...    ...
2440    2440    2017/6/21    47.778    48.896    49.025    47.046    849757.00    601318
2441    2441    2017/6/22    48.669    48.609    49.925    48.520    1146464.00    601318
2442    2442    2017/6/23    48.708    49.183    49.361    48.263    873719.00    601318
2443    2443    2017/6/26    49.450    49.183    50.222    48.817    953192.00    601318
2444    2444    2017/6/27    49.163    49.381    49.411    48.402    780835.00    601318
2445    2445    2017/6/28    49.163    48.085    49.203    48.026    691322.00    601318
2446    2446    2017/6/29    48.273    49.420    49.510    47.858    753228.00    601318
2447    2447    2017/6/30    49.262    49.074    49.658    48.748    598630.00    601318
2448    2448    2017/7/3    49.262    48.411    49.262    48.026    563199.00    601318
2449    2449    2017/7/4    48.273    47.403    48.313    47.393    683920.00    601318
2450    2450    2017/7/5    47.482    49.876    50.152    47.482    1272537.00    601318
2451    2451    2017/7/6    49.876    50.835    51.438    49.529    1137814.00    601318
2452    2452    2017/7/7    50.598    50.459    51.063    49.984    533925.00    601318
2453    2453    2017/7/10    50.469    50.578    51.399    50.143    570776.00    601318
2454    2454    2017/7/11    50.810    51.230    52.010    50.610    699539.00    601318
2455    2455    2017/7/12    51.360    50.610    52.500    50.420    870117.00    601318
2456    2456    2017/7/13    50.980    51.630    51.860    50.830    665342.00    601318
2457    2457    2017/7/14    51.690    52.770    52.790    51.300    707791.00    601318
2458    2458    2017/7/17    53.010    53.900    55.090    52.420    1408791.00    601318
2459    2459    2017/7/18    53.600    53.470    54.260    52.510    879029.00    601318
2460    2460    2017/7/19    53.680    53.840    54.480    53.110    771180.00    601318
2461    2461    2017/7/20    53.550    54.010    54.150    52.820    659198.00    601318
2462    2462    2017/7/21    53.200    51.960    53.280    51.900    1294791.00    601318
2463    2463    2017/7/24    52.080    52.610    53.100    51.680    904595.00    601318
2464    2464    2017/7/25    52.620    52.310    53.050    52.180    506834.00    601318
2465    2465    2017/7/26    52.100    51.890    52.500    51.280    657610.00    601318
2466    2466    2017/7/27    51.850    52.360    52.740    51.090    667132.00    601318
2467    2467    2017/7/28    52.200    51.890    52.460    51.800    491294.00    601318
2468    2468    2017/7/31    51.880    52.020    52.640    51.410    616005.00    601318
2469    2469    2017/8/1    52.200    54.850    54.900    52.200    1147936.00    601318
2470 rows × 8 columns

df2=df.loc[:15,"close":"code"]
df2
close    high    low    volume    code
0    20.657    22.503    20.220    1977633.51    601318
1    20.489    20.944    20.256    425048.32    601318
2    19.593    20.384    19.218    419196.74    601318
3    19.977    20.308    19.315    297727.88    601318
4    20.520    20.706    19.827    287463.78    601318
5    20.273    20.454    20.167    130983.83    601318
6    20.101    20.353    19.735    160887.79    601318
7    19.739    19.999    19.646    145353.06    601318
8    19.818    19.982    19.699    102319.68    601318
9    19.841    19.911    19.333    173306.56    601318
10    19.849    20.525    19.779    152521.90    601318
11    19.960    20.286    19.602    227547.24    601318
12    20.211    20.715    20.088    222026.87    601318
13    19.911    20.308    19.823    136728.32    601318
14    20.026    20.273    19.969    167509.84    601318
15    19.938    20.101    19.739    139810.14    601318
#df2中每個位置都是加10

df2.applymap(lambda x:x+10)
close    high    low    volume    code
0    30.657    32.503    30.220    1977643.51    601328
1    30.489    30.944    30.256    425058.32    601328
2    29.593    30.384    29.218    419206.74    601328
3    29.977    30.308    29.315    297737.88    601328
4    30.520    30.706    29.827    287473.78    601328
5    30.273    30.454    30.167    130993.83    601328
6    30.101    30.353    29.735    160897.79    601328
7    29.739    29.999    29.646    145363.06    601328
8    29.818    29.982    29.699    102329.68    601328
9    29.841    29.911    29.333    173316.56    601328
10    29.849    30.525    29.779    152531.90    601328
11    29.960    30.286    29.602    227557.24    601328
12    30.211    30.715    30.088    222036.87    601328
13    29.911    30.308    29.823    136738.32    601328
14    30.026    30.273    29.969    167519.84    601328
15    29.938    30.101    29.739    139820.14    601328
# map作用於Series

df4=df2["close"]
df4.map(lambda x:x+100)

    0     120.657
    1     120.489
    2     119.593
    3     119.977
    4     120.520
    5     120.273
    6     120.101
    7     119.739
    8     119.818
    9     119.841
    10    119.849
    11    119.960
    12    120.211
    13    119.911
    14    120.026
    15    119.938
    Name: close, dtype: float64
#apply 將操作應用到每一列上

df2.apply(lambda x:x.sum()+1)

    close         321.903
    high          328.752
    low           317.416
    volume    5166066.460
    code      9621089.000
    dtype: float64
#apply 將操作應用到每一行上

df2.apply(lambda x:x.sum()+1,axis=1)

    0     2579015.890
    1     1026429.009
    2     1020574.935
    3      899106.480
    4      888843.833
    5      732363.724
    6      762266.979
    7      746731.444
    8      703698.179
    9      774684.645
    10     753901.053
    11     828926.088
    12     823406.884
    13     738107.362
    14     768889.108
    15     741188.918
    dtype: float64

# 層次索引 內容更新中....

# 從文件讀取 - read_csv:默認分隔符是逗號 - read_table:默認分隔符是\t(tab鍵) 參數: - sep 指定分隔符 - header=None 指定文件無列名 - names 指定列名 - index_col 指定某列作為索引 - skiprows 指定跳過哪些行 - na_values 指定某些字符串為缺失值 - parse_dates 指定某些列是否被解析為日期,布爾值或列表 - nrows 指定讀取幾行文件 - chunksize 分塊讀取文件,指定塊大小

# read_table 默認是以\t(tab)為分隔符

pd.read_table("d:/new.csv")

pd.read_table("d:/new.csv",sep=",")

sep 還可以是正則表達式,比如 sep="\s+",表示任意長度的空白字符

#  在讀取數據的時候,會默認將第一列指定為列名,可以通過修改header=None,指定第一行不是列名
pd.read_table("d:/new.csv",sep=",",header=None)

當設置header=None時,會自動取一個列名0,1,2,3,4,5,6,7

# 如果想自己取一個列名,可以修改names

pd.read_table("d:/new.csv",sep=",",header=None,names=["id","date","open","close","high","low","volumw","code"])

#  還可以設置跳過哪些行

#完整的
pd.read_table("d:/new.csv",sep=",")

pd.read_table("d:/new.csv",sep=",",skiprows=[0])

從上邊可以看出,它跳過的行是從表格的第一行開始計數,索引為0(在這里第一行列名就是索引0的位置)

pd.read_table("d:/new.csv",sep=",",skiprows=[1])

#  在導入的時候,默認會生成行索引,如果我們想使用某一列作為行索引,可以使用index_col,可以使用多列["id","close"]

df=pd.read_table("d:/new2.csv",sep=",",index_col=["id"])
df

df.loc[4:7,"close":"low"]

# 一般在實際場景中,我們經常用用date作為行索引

df=pd.read_table("d:/new2.csv",sep=",",index_col="date")
df

type(df.index[0])

str
#  這里的date是一個字符串,我們可以將這個date轉化為一個時間類型:設置parse_dates

df=pd.read_table("d:/new2.csv",sep=",",index_col="date",parse_dates=["date"])
type(df.index[0])

pandas._libs.tslib.Timestamp

在文件里如果有nan這個字符(我們之前講的是內存里邊nan),如何去識別?

# 設置na_values

# 凡是"nan","None","null","xxx"這樣的字符串都解析為nan,否則整列都被解析為字符串(記住,是整列,因為一列的數據類型必須一致)
df=pd.read_table("d:/new3.csv",sep=",")
df

df["id"][0]
'None'


type(df["id"].iloc[1])
str



df=pd.read_table("d:/new3.csv",sep=",",na_values=["nan","None","null","xxx"])
df

type(df["id"].iloc[1])

numpy.float64

# 寫入到文件 to_csv 主要參數: - sep 指定分隔符 - na_rep 指定缺失值轉換的字符串,默認為空字符串 - header=False 不輸出第一行的列名 - index=False 不輸出行的索引一列 - columns 輸出指定列

# 默認是行名和列名都輸出,缺失值轉換的字符串轉換為空

df.to_csv("d:/ceshi.csv",header=False,index=False,na_rep="DD",columns=["close"])

還可以導出成其它的文件類型:json,xml,Html,數據庫

# 時間序列

# to_datetime 可以將字符串轉換為一種特定的時間類型

pd.to_datetime(df["date"])
    0    2007-03-01
    1    2007-03-02
    2    2007-03-05
    3    2007-03-06
    4    2007-03-07
    5    2007-03-08
    6    2007-03-12
    7    2007-03-13
    8    2007-03-14
    9    2007-03-15
    10   2007-03-16
    11   2007-03-20
    12   2007-03-21
    13   2007-03-22
    Name: date, dtype: datetime64[ns]

時間處理對象:date_range
參數: - start 開始時間 - end 結束時間 - periods 時間長度 - freq 時間頻率,默認為"D",可選H(our),W(eek),B(usiness),M(onth),S(econd),A(year),T

# date_range 產生一組時間

pd.date_range("2017-06-01","2017-07-01")
    DatetimeIndex(['2017-06-01', '2017-06-02', '2017-06-03', '2017-06-04',
                   '2017-06-05', '2017-06-06', '2017-06-07', '2017-06-08',
                   '2017-06-09', '2017-06-10', '2017-06-11', '2017-06-12',
                   '2017-06-13', '2017-06-14', '2017-06-15', '2017-06-16',
                   '2017-06-17', '2017-06-18', '2017-06-19', '2017-06-20',
                   '2017-06-21', '2017-06-22', '2017-06-23', '2017-06-24',
                   '2017-06-25', '2017-06-26', '2017-06-27', '2017-06-28',
                   '2017-06-29', '2017-06-30', '2017-07-01'],
                  dtype='datetime64[ns]', freq='D')
#  假如要每一周出一天(默認是每一天出一個)

# 這里是星期日為標准
pd.date_range("2017-06-01","2017-08-01",freq="W")

    DatetimeIndex(['2017-06-04', '2017-06-11', '2017-06-18', '2017-06-25', '2017-07-02', '2017-07-09', '2017-07-16', '2017-07-23', '2017-07-30'], dtype='datetime64[ns]', freq='W-SUN')

#  假如要只出工作日

pd.date_range("2017-06-01","2017-08-01",freq="B")
    DatetimeIndex(['2017-06-01', '2017-06-02', '2017-06-05', '2017-06-06',
                   '2017-06-07', '2017-06-08', '2017-06-09', '2017-06-12',
                   '2017-06-13', '2017-06-14', '2017-06-15', '2017-06-16',
                   '2017-06-19', '2017-06-20', '2017-06-21', '2017-06-22',
                   '2017-06-23', '2017-06-26', '2017-06-27', '2017-06-28',
                   '2017-06-29', '2017-06-30', '2017-07-03', '2017-07-04',
                   '2017-07-05', '2017-07-06', '2017-07-07', '2017-07-10',
                   '2017-07-11', '2017-07-12', '2017-07-13', '2017-07-14',
                   '2017-07-17', '2017-07-18', '2017-07-19', '2017-07-20',
                   '2017-07-21', '2017-07-24', '2017-07-25', '2017-07-26',
                   '2017-07-27', '2017-07-28', '2017-07-31', '2017-08-01'],
                  dtype='datetime64[ns]', freq='B')
#  半個月
pd.date_range("2017-06-01","2017-08-01",freq="SM")


DatetimeIndex(['2017-06-15', '2017-06-30', '2017-07-15', '2017-07-31'], dtype='datetime64[ns]', freq='SM-15')


#  一個月
pd.date_range("2017-06-01","2017-08-01",freq="M")

    DatetimeIndex(['2017-06-30', '2017-07-31'], dtype='datetime64[ns]', freq='M')


#  分鍾
pd.date_range("2017-06-01","2017-08-01",freq="T")
    DatetimeIndex(['2017-06-01 00:00:00', '2017-06-01 00:01:00',
                   '2017-06-01 00:02:00', '2017-06-01 00:03:00',
                   '2017-06-01 00:04:00', '2017-06-01 00:05:00',
                   '2017-06-01 00:06:00', '2017-06-01 00:07:00',
                   '2017-06-01 00:08:00', '2017-06-01 00:09:00',
                   ...
                   '2017-07-31 23:51:00', '2017-07-31 23:52:00',
                   '2017-07-31 23:53:00', '2017-07-31 23:54:00',
                   '2017-07-31 23:55:00', '2017-07-31 23:56:00',
                   '2017-07-31 23:57:00', '2017-07-31 23:58:00',
                   '2017-07-31 23:59:00', '2017-08-01 00:00:00'],
                  dtype='datetime64[ns]', length=87841, freq='T')
#  年
pd.date_range("2017-06-01","2019-08-01",freq="A")

    DatetimeIndex(['2017-12-31', '2018-12-31'], dtype='datetime64[ns]', freq='A-DEC')


#  星期一
pd.date_range("2017-06-01","2017-08-01",freq="W-MON")


    DatetimeIndex(['2017-06-05', '2017-06-12', '2017-06-19', '2017-06-26',
                   '2017-07-03', '2017-07-10', '2017-07-17', '2017-07-24',
                   '2017-07-31'],
                  dtype='datetime64[ns]', freq='W-MON')

periods 指定時間長度

#  從2017-06-01開始,產生20天

pd.date_range("2017-06-01",periods=20)
    DatetimeIndex(['2017-06-01', '2017-06-02', '2017-06-03', '2017-06-04',
                   '2017-06-05', '2017-06-06', '2017-06-07', '2017-06-08',
                   '2017-06-09', '2017-06-10', '2017-06-11', '2017-06-12',
                   '2017-06-13', '2017-06-14', '2017-06-15', '2017-06-16',
                   '2017-06-17', '2017-06-18', '2017-06-19', '2017-06-20'],
                  dtype='datetime64[ns]', freq='D')
#  從2017-06-01開始,產生20個周

pd.date_range("2017-06-01",periods=20,freq="W")
    DatetimeIndex(['2017-06-04', '2017-06-11', '2017-06-18', '2017-06-25',
                   '2017-07-02', '2017-07-09', '2017-07-16', '2017-07-23',
                   '2017-07-30', '2017-08-06', '2017-08-13', '2017-08-20',
                   '2017-08-27', '2017-09-03', '2017-09-10', '2017-09-17',
                   '2017-09-24', '2017-10-01', '2017-10-08', '2017-10-15'],
                  dtype='datetime64[ns]', freq='W-SUN')
df=pd.read_csv("d:/601318.csv",index_col="date",parse_dates=["date"])
df

2470 rows × 7 columns

type(df.index)


pandas.core.indexes.datetimes.DatetimeIndex

可以看到df.index的類型就是pd.date_range之后的類型:DatetimeIndex DatetimeIndex這個類型可以在查找時非常方便

#  查找 2017年的數據

df["2017"]

141 rows × 7 columns

#  查找 2017年8月的數據

df["2017-8"]

#  查找 2017年6月到9月的數據

df["2017-06":"2017-09"]

這里是按照時間對象索引(類似於標簽索引),顧前也顧尾

df[:10]

七、測驗

求出股票行情的前5日和前10日的平均值(這里是close列的平均值)

import numpy as np
import pandas as pd
df=pd.read_csv("d:/ceshi.csv",index_col="date",parse_dates=["date"])
df

2470 rows × 7 columns

方案1:手動計算

# 思路:拿出每一行前5行的"close"列的數據,再mean()求出平均值,賦值給列"ma5"
df2=df[:10].copy()
df2.loc["2007-03-07","ma5"]=df2["close"][:5].mean()
df2.loc["2007-03"]
# 創建兩列,並初始化為nan

df["ma5"]=np.nan
df["ma10"]=np.nan
df

2470 rows × 9 columns

#  使用for循環一個一個的去賦值

for i in range(4,len(df)):
    df.loc[df.index[i],"ma5"]=df["close"][i-4:i+1].mean()

for i in range(9,len(df)):
    df.loc[df.index[i],"ma10"]=df["close"][i-9:i+1].mean()

df

2470 rows × 9 columns


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM