Python: pandas notes


 

>>> from pandas import Series, DataFrame
>>> import pandas as pd
>>> import numpy as np
>>> sdata={'語文':89,'數學':96,'音樂':39,'英語':78,'化學':88}
# Converting a dict to a Series
>>> studata=Series(sdata)
>>> studata
化學    88
數學    96
英語    78
語文    89
音樂    39
dtype: int64
>>> obj=Series(sdata,index=['物理','數學','化學'])
>>> obj
物理     NaN  [there is no 物理 score, so NaN; this also forces the values below to float]
數學    96.0
化學    88.0
dtype: float64
# Test each entry for null values
>>> pd.isnull(obj)
物理     True
數學    False
化學    False
dtype: bool
>>> pd.notnull(obj)
物理    False
數學     True
化學     True
dtype: bool
>>> obj.isnull()
物理     True
數學    False
化學    False
dtype: bool

# Adding corresponding entries
>>> en=Series([84,94,51,81],index=['張三','李四','王五','趙六'])
>>> sx=Series([94,81,31,91],index=['張三','趙六','王五','李四'])
>>> en+sx   # indexes are aligned automatically when adding
張三    178
李四    185
王五     82
趙六    162
dtype: int64

# The Series name attribute
>>> en.name='英語成績'
>>> en
張三    84
李四    94
王五    51
趙六    81
Name: 英語成績, dtype: int64
>>> en.index.name='姓名'
>>> en
姓名
張三    84
李四    94
王五    51
趙六    81
Name: 英語成績, dtype: int64

# The index can be modified in place
>>> en.index=['zs','ll','ww','zl']
>>> en
zs    84
ll    94
ww    51
zl    81
Name: 英語成績, dtype: int64

#############DataFrame##############

>>> data={
	'name':['張三','張三','張三','李四','李四','李四'],
	'year':[2001,2002,2003,2001,2002,2003],
	'weight':[54,50,60,61,63,65],
}
>>> frame=DataFrame(data)
>>> frame
  name  weight  year
0   張三      54  2001
1   張三      50  2002
2   張三      60  2003
3   李四      61  2001
4   李四      63  2002
5   李四      65  2003

# columns controls the display order and the selection of columns
>>> DataFrame(data,columns=['year','weight','name'])
   year  weight name
0  2001      54   張三
1  2002      50   張三
2  2003      60   張三
3  2001      61   李四
4  2002      63   李四
5  2003      65   李四

>>> a=DataFrame(data,columns=['year','weight','name','sex'],index=['one','two','three','four','five','five'])
>>> a
       year  weight name  sex
one    2001      54   張三  NaN
two    2002      50   張三  NaN
three  2003      60   張三  NaN
four   2001      61   李四  NaN
five   2002      63   李四  NaN
five   2003      65   李四  NaN

# With duplicate index labels, selecting one label returns one or several rows
>>> a.ix['five']
      year  weight name  sex
five  2002      63   李四  NaN
five  2003      65   李四  NaN

# DataFrame --> Series (dimension reduction)
# Selecting a single column
>>> info=DataFrame(data,columns=['year','weight','name','sex'],index=['one','two','three','four','five','five'])
>>> info['name']
one      張三
two      張三
three    張三
four     李四
five     李四
five     李四
Name: name, dtype: object

# Assigning a scalar to a column
>>> info['sex']='男'
>>> info
       year  weight name sex
one    2001      54   張三   男
two    2002      50   張三   男
three  2003      60   張三   男
four   2001      61   李四   男
five   2002      63   李四   男
five   2003      65   李四   男

# Partial column assignment: a Series aligns on the index
>>> val=Series(['man','woman','man'],index=['two','four','five'])
>>> info['sex']=val
>>> info
       year  weight name    sex
one    2001      54   張三    NaN
two    2002      50   張三    man
three  2003      60   張三    NaN
four   2001      61   李四  woman
five   2002      63   李四    man
five   2003      65   李四    man

# Assigning to a missing column creates it
>>> info['sexflag']=info.sex=='man'
>>> info
       year  weight name    sex sexflag
one    2001      54   張三    NaN   False
two    2002      50   張三    man    True
three  2003      60   張三    NaN   False
four   2001      61   李四  woman   False
five   2002      63   李四    man    True
five   2003      65   李四    man    True

# Deleting a column
>>> del info['sex']
>>> info
       year  weight name sexflag
one    2001      54   張三   False
two    2002      50   張三    True
three  2003      60   張三   False
four   2001      61   李四   False
five   2002      63   李四    True
five   2003      65   李四    True

# Nested dict -----convert---> DataFrame
# Outer keys become the columns; inner keys become the rows
>>> studata={'張三':{'語文':91,'數學':99,'物理':90},'李四':{'語文':31,'數學':65,'物理':45}}
>>> info2=DataFrame(studata)
>>> info2
    張三  李四
數學  99  65
物理  90  45
語文  91  31
>>> info2.T
    數學  物理  語文
張三  99  90  91
李四  65  45  31

# The index.name and columns.name attributes
>>> info
       year  weight name sexflag
one    2001      54   張三   False
two    2002      50   張三    True
three  2003      60   張三   False
four   2001      61   李四   False
five   2002      63   李四    True
five   2003      65   李四    True
>>> info.index.name='個人信息'
>>> info.columns.name='索引'
>>> info
索引 year  weight name sexflag
個人信息                            
one    2001      54   張三   False
two    2002      50   張三    True
three  2003      60   張三   False
four   2001      61   李四   False
five   2002      63   李四    True
five   2003      65   李四    True

>>> info.index
Index([u'one', u'two', u'three', u'four', u'five', u'five'], dtype='object', name=u'個人信息')
# Deduplicating the index values
>>> info.index.unique
<bound method Index.unique of Index([u'one', u'two', u'three', u'four', u'five', u'five'], dtype='object', name=u'個人信息')>
>>> info.index.unique()
array(['one', 'two', 'three', 'four', 'five'], dtype=object)
# Is the index unique?
>>> info.index.is_unique
False
# Returns True when every element is >= the one before it
>>> DataFrame(range(1,4),index=range(1,4)).index.is_monotonic
True
>>> info.index.is_monotonic
False
# Drop the passed value and get a new Index
>>> DataFrame(range(1,4),index=range(1,4)).index.drop(1)
Int64Index([2, 3], dtype='int64')

>>> obj=Series([33,23],index=['a','b'])
>>> obj
a    33
b    23
dtype: int64
>>> obj2=obj.reindex(['b','a','c'])
>>> obj2
b    23.0
a    33.0
c     NaN
dtype: float64
>>> obj2=obj.reindex(['b','a','c'],fill_value=0)
>>> obj2
b    23
a    33
c     0
dtype: int64

>>> obj3=Series(['blue','purple','yellow'],index=[0,2,4])
>>> obj3
0      blue
2    purple
4    yellow
dtype: object
# ffill: forward fill
>>> obj3.reindex(range(6),method='ffill')
0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object
# bfill: backward fill
>>> obj3.reindex(range(6),method='bfill')
0      blue
1    purple
2    purple
3    yellow
4    yellow
5       NaN
dtype: object

>>> frame=DataFrame(np.arange(9).reshape((3,3)),index=['a','b','d'],columns=['Ohio','Texas','california'])
>>> frame
   Ohio  Texas  california
a     0      1           2
b     3      4           5
d     6      7           8
# Reindexing the rows
>>> frame2=frame.reindex(['a','b','c','d'])
>>> frame2
   Ohio  Texas  california
a   0.0    1.0         2.0
b   3.0    4.0         5.0
c   NaN    NaN         NaN
d   6.0    7.0         8.0
# Reindexing the columns
>>> cols=['Texas','Ohio','uknown']
>>> frame.reindex(columns=cols)
   Texas  Ohio  uknown
a      1     0     NaN
b      4     3     NaN
d      7     6     NaN

>>> frame.reindex(index=['a','b','c','d'],method='ffill',columns=cols)
   Texas  Ohio  uknown
a      1     0     NaN
b      4     3     NaN
c      4     3     NaN
d      7     6     NaN
>>> frame.ix[['a','b','c','d'],cols]
   Texas  Ohio  uknown
a    1.0   0.0     NaN
b    4.0   3.0     NaN
c    NaN   NaN     NaN
d    7.0   6.0     NaN


>>> data=frame.ix[['a','b','c','d'],cols]
>>> data
   Texas  Ohio  uknown
a    1.0   0.0     NaN
b    4.0   3.0     NaN
c    NaN   NaN     NaN
d    7.0   6.0     NaN
# Dropping rows
>>> data.drop(['c','b'])
   Texas  Ohio  uknown
a    1.0   0.0     NaN
d    7.0   6.0     NaN
>>> data.drop('uknown',axis=1)
   Texas  Ohio
a    1.0   0.0
b    4.0   3.0
c    NaN   NaN
d    7.0   6.0

# Conditional (boolean) filtering on a column
>>> info[info['weight']>60]
索引 year  weight name sexflag
個人信息                           
four  2001      61   李四   False
five  2002      63   李四    True
five  2003      65   李四    True

# Selecting a single row and a subset of columns
>>> info.ix['one',['name','year']]
索引
name      張三
year    2001
Name: one, dtype: object


>>> data=DataFrame(np.arange(16).reshape((4,4)),index=['Ohio','Colorado','Utah','NewYork'],columns=['one','two','three','four'])
>>> data
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
NewYork    12   13     14    15
>>> data['two']
Ohio         1
Colorado     5
Utah         9
NewYork     13
Name: two, dtype: int64
>>> data[['three','one']]
          three  one
Ohio          2    0
Colorado      6    4
Utah         10    8
NewYork      14   12
>>> 
>>> data[:2]
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
>>> data[data['three']>5]
          one  two  three  four
Colorado    4    5      6     7
Utah        8    9     10    11
NewYork    12   13     14    15

>>> data<5
            one    two  three   four
Ohio       True   True   True   True
Colorado   True  False  False  False
Utah      False  False  False  False
NewYork   False  False  False  False
>>> data[data<5]=0
>>> data
          one  two  three  four
Ohio        0    0      0     0
Colorado    0    5      6     7
Utah        8    9     10    11
NewYork    12   13     14    15

# Combined row/column selection
>>> data.ix['Colorado',['two','three']]
two      5
three    6
Name: Colorado, dtype: int64
>>> data.ix[['Colorado','Utah'],[3,0,1]]
          four  one  two
Colorado     7    0    5
Utah        11    8    9

>>> data.ix[:'Utah','two']
Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int64
>>> 

>>> data.ix[data.three>5,:3]
          one  two  three
Colorado    0    5      6
Utah        8    9     10
NewYork    12   13     14

# obj[val]          selects a single column or group of columns from a DataFrame; handy in a few special cases
# obj.ix[val]       selects a single row or group of rows
# obj.ix[:,val]     selects a single column or subset of columns
# obj.ix[val1,val2] selects rows and columns at once
# reindex           conforms to a new index (a short recap sketch follows)
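A compact sketch of these access forms against the data frame above (values reflect the earlier data[data<5]=0 step):

>>> data.ix['Utah']            # obj.ix[val]: one row
one       8
two       9
three    10
four     11
Name: Utah, dtype: int64
>>> data.ix[:,'four']          # obj.ix[:,val]: one column
Ohio         0
Colorado     7
Utah        11
NewYork     15
Name: four, dtype: int64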

# Data alignment between DataFrames
>>> df1 = DataFrame(np.arange(9.).reshape((3,3)),columns=list('bcd'),index=['good','bad','normal'])
>>> df2 = DataFrame(np.arange(12.).reshape((4,3)),columns=list('bde'),index=['good','normal','bad','supper'])
>>> df1
          b    c    d
good    0.0  1.0  2.0
bad     3.0  4.0  5.0
normal  6.0  7.0  8.0
>>> df2
          b     d     e
good    0.0   1.0   2.0
normal  3.0   4.0   5.0
bad     6.0   7.0   8.0
supper  9.0  10.0  11.0

>>> df1+df2
          b   c     d   e
bad     9.0 NaN  12.0 NaN
good    0.0 NaN   3.0 NaN
normal  9.0 NaN  12.0 NaN
supper  NaN NaN   NaN NaN

# Fill missing values with 0 when adding
>>> df1.add(df2,fill_value=0)
          b    c     d     e
bad     9.0  4.0  12.0   8.0
good    0.0  1.0   3.0   2.0
normal  9.0  7.0  12.0   5.0
supper  9.0  NaN  10.0  11.0
# fill_value with reindex
>>> df1.reindex(columns=df2.columns,fill_value=0)
          b    d  e
good    0.0  2.0  0
bad     3.0  5.0  0
normal  6.0  8.0  0

# Other arithmetic methods (a subtraction sketch follows the list):
add +
sub -
div /
mul *
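These accept the same fill_value argument as add; for example, subtracting the df1/df2 pair above:

>>> df1.sub(df2,fill_value=0)
          b    c     d     e
bad    -3.0  4.0  -2.0  -8.0
good    0.0  1.0   1.0  -2.0
normal  3.0  7.0   4.0  -5.0
supper -9.0  NaN -10.0 -11.0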


Operations between a DataFrame and a Series
>>> frame=DataFrame(np.arange(12.).reshape((4,3)),columns=list('bde'),index=['good','bad','supper','uknown'])
>>> frame
          b     d     e
good    0.0   1.0   2.0
bad     3.0   4.0   5.0
supper  6.0   7.0   8.0
uknown  9.0  10.0  11.0
>>> series=frame.ix[0]
>>> series
b    0.0
d    1.0
e    2.0
Name: good, dtype: float64
>>> 
>>> frame-series
          b    d    e
good    0.0  0.0  0.0
bad     3.0  3.0  3.0
supper  6.0  6.0  6.0
uknown  9.0  9.0  9.0

# frame and series arithmetic broadcasts
>>> series2=Series(range(3),index=list('bef'))
>>> series2
b    0
e    1
f    2
dtype: int64
>>> frame+series2
          b   d     e   f
good    0.0 NaN   3.0 NaN
bad     3.0 NaN   6.0 NaN
supper  6.0 NaN   9.0 NaN
uknown  9.0 NaN  12.0 NaN

# Broadcasting down the columns (matching on the row index)
>>> series3=frame['d']
>>> frame.sub(series3,axis=0)
          b    d    e
good   -1.0  0.0  1.0
bad    -1.0  0.0  1.0
supper -1.0  0.0  1.0
uknown -1.0  0.0  1.0


>>> frame=DataFrame(np.random.randn(4,3),columns=list('bde'),index=['good','bad','nice','supper'])
>>> frame
               b         d         e
good    0.428420 -0.951975  0.862226
bad    -0.666254 -0.988423  2.442255
nice    1.617591  0.377867 -1.069077
supper -1.417150  0.449853  0.685007
# Convert everything to absolute (positive) values
>>> np.abs(frame)
               b         d         e
good    0.428420  0.951975  0.862226
bad     0.666254  0.988423  2.442255
nice    1.617591  0.377867  1.069077
supper  1.417150  0.449853  0.685007

>>> f=lambda x: x.max()-x.min()
>>> frame.apply(f,axis=0)
b    3.034740
d    1.438276
e    3.511332
dtype: float64
>>> frame.apply(f,axis=1)
good      1.814201
bad       3.430677
nice      2.686668
supper    2.102157
dtype: float64

>>> def f(x):return Series([x.min(),x.max()],index=['min','max'])
... 
>>> frame.apply(f)
            b         d         e
min -1.417150 -0.988423 -1.069077
max  1.617591  0.449853  2.442255

# Formatting the contents
>>> format=lambda x:'%.2f' % x
>>> frame.applymap(format)
            b      d      e
good     0.43  -0.95   0.86
bad     -0.67  -0.99   2.44
nice     1.62   0.38  -1.07
supper  -1.42   0.45   0.69



############# Sorting and ranking #############
# ascending: sort ascending or descending
>>> frame=DataFrame(np.arange(8).reshape((2,4)),index=['three','one'],columns=list('nalv'))
>>> frame
       n  a  l  v
three  0  1  2  3
one    4  5  6  7
>>> frame.sort_index()
       n  a  l  v
one    4  5  6  7
three  0  1  2  3
>>> frame.sort_index(axis=1)
       a  l  n  v
three  1  2  0  3
one    5  6  4  7
>>> frame.sort_index(axis=1,ascending=False)
       v  n  l  a
three  3  0  2  1
one    7  4  6  5

>>> obj=Series([4,5,-3,2])
>>> obj.order()   # order() was later renamed sort_values()
2   -3
3    2
0    4
1    5
dtype: int64

# Sort by column v in descending order
>>> frame.sort_index(axis=0,ascending=False,by='v')
       n  a  l  v
one    4  5  6  7
three  0  1  2  3

>>> frame.sort_index(axis=0,ascending=False,by=['v','l'])
       n  a  l  v
one    4  5  6  7
three  0  1  2  3

>>> obj=Series([7,-5,7,4,2,0,4])
>>> obj.rank(method='first')
0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64
>>> obj.rank(ascending=False,method='max')
0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64


>>> DataFrame(studata).T
    數學  物理  語文
張三  99  90  91
李四  65  45  31
>>> DataFrame(studata).T.rank(axis=1,ascending=False)
     數學   物理   語文
張三  1.0  3.0  2.0
李四  1.0  2.0  3.0
>>> DataFrame(studata).T.rank(axis=0,ascending=False)
     數學   物理   語文
張三  1.0  1.0  1.0
李四  2.0  2.0  2.0


>>> datastu=pd.read_csv('/Users/similarface/Downloads/jnn.csv')
>>> datastu
           准考證號   姓名  班級     語文  數學     英語  化學  物理
0  304040250124   羅茜   1  101.0  94  102.5  79  74
1  304040250128  沈怡君   1   91.5  96   69.0  82  69
2  304040250321   魏華   2   74.0  28   42.0  56  56
3  304040250233  何仕林   2   60.5  42   34.5  49  46
4  304040250725   屈妮   5   93.5  63   77.5  55  66
5  304040250709  鄧培蓓   5  102.5  81   47.0  65  58
6  304040250805  鄭清霞   5   89.0  80   63.5  63  65
7  304040250827   明楊   6  108.5  92   79.0  89  83
8  304040250819   李倩   6   93.5  61   44.0  45  32
9  304040250912  江明悅   6    0.0   0    0.0   0   0

>>> datastu.rank(axis=1,ascending=False,method='min')
   准考證號   姓名   班級   語文   數學   英語   化學   物理
0   2.0  1.0  8.0  4.0  5.0  3.0  6.0  7.0
1   2.0  1.0  8.0  4.0  3.0  6.0  5.0  6.0
2   2.0  1.0  8.0  3.0  7.0  6.0  4.0  4.0
3   2.0  1.0  8.0  3.0  6.0  7.0  4.0  5.0
4   2.0  1.0  8.0  3.0  6.0  4.0  7.0  5.0
5   2.0  1.0  8.0  3.0  4.0  7.0  5.0  6.0
6   2.0  1.0  8.0  3.0  4.0  6.0  7.0  5.0
7   2.0  1.0  8.0  3.0  4.0  7.0  5.0  6.0
8   2.0  1.0  8.0  3.0  4.0  6.0  5.0  7.0
9   2.0  1.0  3.0  4.0  4.0  4.0  4.0  4.0
>>> datastu.rank(axis=0,ascending=False,method='min')
   准考證號    姓名   班級    語文    數學    英語    化學    物理
0  10.0   4.0  9.0   3.0   2.0   1.0   3.0   2.0
1   9.0   5.0  9.0   6.0   1.0   4.0   2.0   3.0
2   7.0   1.0  7.0   8.0   9.0   8.0   6.0   7.0
3   8.0  10.0  7.0   9.0   8.0   9.0   8.0   8.0
4   5.0   9.0  4.0   4.0   6.0   3.0   7.0   4.0
5   6.0   3.0  4.0   2.0   4.0   6.0   4.0   6.0
6   4.0   2.0  4.0   7.0   5.0   5.0   5.0   5.0
7   2.0   8.0  1.0   1.0   3.0   2.0   1.0   1.0
8   3.0   7.0  1.0   4.0   7.0   7.0   9.0   9.0
9   1.0   6.0  1.0  10.0  10.0  10.0  10.0  10.0

>>> data=datastu[['語文','數學','物理','英語','化學']]
>>> data
      語文  數學  物理     英語  化學
0  101.0  94  74  102.5  79
1   91.5  96  69   69.0  82
2   74.0  28  56   42.0  56
3   60.5  42  46   34.5  49
4   93.5  63  66   77.5  55
5  102.5  81  58   47.0  65
6   89.0  80  65   63.5  63
7  108.5  92  83   79.0  89
8   93.5  61  32   44.0  45
9    0.0   0   0    0.0   0


>>> data.sum()
語文    814.0
數學    637.0
物理    549.0
英語    559.0
化學    583.0
dtype: float64

>>> data.sum(axis=1)
0    450.5
1    407.5
2    256.0
3    232.0
4    355.0
5    353.5
6    360.5
7    451.5
8    275.5
9      0.0
dtype: float64

# axis: the axis to reduce over
# skipna: exclude missing values (NaN); True by default
# level: reduce by level of a hierarchical index
# (a skipna sketch follows)
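A small self-contained sketch of skipna (the d frame here is illustrative):

>>> d=DataFrame([[1.4,np.nan],[7.1,-4.5],[np.nan,np.nan],[0.75,-1.3]],index=['a','b','c','d'],columns=['one','two'])
>>> d.sum(axis=1)
a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64
>>> d.sum(axis=1,skipna=False)
a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64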


>>> data
      語文  數學  物理     英語  化學
0  101.0  94  74  102.5  79
1   91.5  96  69   69.0  82
2   74.0  28  56   42.0  56
3   60.5  42  46   34.5  49
4   93.5  63  66   77.5  55
5  102.5  81  58   47.0  65
6   89.0  80  65   63.5  63
7  108.5  92  83   79.0  89
8   93.5  61  32   44.0  45
9    0.0   0   0    0.0   0
# Indirect statistics: the index of the max value
>>> data.idxmax()
語文    7   [index of the highest score is 7]
數學    1   [index of the highest score is 1]
物理    7   [index of the highest score is 7]
英語    0   [index of the highest score is 0]
化學    7   [index of the highest score is 7]
dtype: int64
# Cumulative sum
>>> data.cumsum()
      語文     數學     物理     英語     化學
0  101.0   94.0   74.0  102.5   79.0
1  192.5  190.0  143.0  171.5  161.0
2  266.5  218.0  199.0  213.5  217.0
3  327.0  260.0  245.0  248.0  266.0
4  420.5  323.0  311.0  325.5  321.0
5  523.0  404.0  369.0  372.5  386.0
6  612.0  484.0  434.0  436.0  449.0
7  720.5  576.0  517.0  515.0  538.0
8  814.0  637.0  549.0  559.0  583.0
9  814.0  637.0  549.0  559.0  583.0

>>> data.describe()
               語文        數學         物理          英語         化學
count   10.000000  10.00000  10.000000   10.000000  10.000000
mean    81.400000  63.70000  54.900000   55.900000  58.300000
std     31.857146  31.86447  24.052951   28.670349  25.117723
min      0.000000   0.00000   0.000000    0.000000   0.000000
25%     77.750000  46.75000  48.500000   42.500000  50.500000
50%     92.500000  71.50000  61.500000   55.250000  59.500000
75%     99.125000  89.25000  68.250000   75.375000  75.500000
max    108.500000  96.00000  83.000000  102.500000  89.000000

'''

DataFrame.abs()	Return an object with absolute value taken–only applicable to objects that are all numeric.
DataFrame.all([axis, bool_only, skipna, level])	Return whether all elements are True over requested axis
DataFrame.any([axis, bool_only, skipna, level])	Return whether any element is True over requested axis
DataFrame.clip([lower, upper, out, axis])	Trim values at input threshold(s).
DataFrame.clip_lower(threshold[, axis])	Return copy of the input with values below given value(s) truncated.
DataFrame.clip_upper(threshold[, axis])	Return copy of input with values above given value(s) truncated.
DataFrame.corr([method, min_periods])	Compute pairwise correlation of columns, excluding NA/null values
DataFrame.corrwith(other[, axis, drop])	Compute pairwise correlation between rows or columns of two DataFrame objects.
DataFrame.count([axis, level, numeric_only])	Return Series with number of non-NA/null observations over requested axis.
DataFrame.cov([min_periods])	Compute pairwise covariance of columns, excluding NA/null values
DataFrame.cummax([axis, dtype, out, skipna])	Return cumulative max over requested axis.
DataFrame.cummin([axis, dtype, out, skipna])	Return cumulative min over requested axis.
DataFrame.cumprod([axis, dtype, out, skipna])	Return cumulative prod over requested axis.
DataFrame.cumsum([axis, dtype, out, skipna])	Return cumulative sum over requested axis.
DataFrame.describe([percentiles, include, ...])	Generate various summary statistics, excluding NaN values.
DataFrame.diff([periods, axis])	1st discrete difference of object (first-order difference; very useful for time series)
DataFrame.eval(expr[, inplace])	Evaluate an expression in the context of the calling DataFrame instance.
DataFrame.kurt([axis, skipna, level, ...])	Return unbiased kurtosis (4th moment) over requested axis using Fisher’s definition of kurtosis (kurtosis of normal == 0.0).
DataFrame.mad([axis, skipna, level])	Return the mean absolute deviation of the values for the requested axis
DataFrame.max([axis, skipna, level, ...])	This method returns the maximum of the values in the object.
DataFrame.mean([axis, skipna, level, ...])	Return the mean of the values for the requested axis
DataFrame.median([axis, skipna, level, ...])	Return the median of the values for the requested axis
DataFrame.min([axis, skipna, level, ...])	This method returns the minimum of the values in the object.
DataFrame.mode([axis, numeric_only])	Gets the mode(s) of each element along the axis selected.
DataFrame.pct_change([periods, fill_method, ...])	Percent change over given number of periods.
DataFrame.prod([axis, skipna, level, ...])	Return the product of the values for the requested axis
DataFrame.quantile([q, axis, numeric_only, ...])	Return values at the given quantile over requested axis, a la numpy.percentile.
DataFrame.rank([axis, method, numeric_only, ...])	Compute numerical data ranks (1 through n) along axis.
DataFrame.round([decimals, out])	Round a DataFrame to a variable number of decimal places.
DataFrame.sem([axis, skipna, level, ddof, ...])	Return unbiased standard error of the mean over requested axis.
DataFrame.skew([axis, skipna, level, ...])	Return unbiased skew (3rd moment) over requested axis
DataFrame.sum([axis, skipna, level, ...])	Return the sum of the values for the requested axis
DataFrame.std([axis, skipna, level, ddof, ...])	Return sample standard deviation over requested axis.
DataFrame.var([axis, skipna, level, ddof, ...])	Return unbiased variance over requested axis.
'''
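To make two of the time-series entries above concrete, a short sketch of diff and pct_change:

>>> s=Series([100.,102.,105.,103.])
>>> s.diff()          # first discrete difference
0    NaN
1    2.0
2    3.0
3   -2.0
dtype: float64
>>> s.pct_change()    # fractional change between consecutive rows
0         NaN
1    0.020000
2    0.029412
3   -0.019048
dtype: float64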


>>> import pandas.io.data as web   # pandas.io.data later moved to the pandas-datareader package
>>> all_data={}
>>> for ticker in ['AAPL','IBM','MSFT','GOOG']: all_data[ticker]=web.get_data_yahoo(ticker,'1/1/2000','1/1/2010')
>>> price=DataFrame({tic:data['Adj Close'] for tic,data in all_data.iteritems()})
>>> volume=DataFrame({tic:data['Volume'] for tic,data in all_data.iteritems()})
>>> returns=price.pct_change()
>>> returns.tail()
                AAPL      GOOG       IBM      MSFT
Date                                              
2009-12-24  0.034339  0.011117  0.004385  0.002587
2009-12-28  0.012294  0.007098  0.013326  0.005484
2009-12-29 -0.011861 -0.005571 -0.003477  0.007058
2009-12-30  0.012147  0.005376  0.005461 -0.013699
2009-12-31 -0.004300 -0.004416 -0.012597 -0.015504
# Computing correlation coefficients
>>> returns.IBM.corr(returns.GOOG)
0.39068882087254675
>>> returns.corrwith(returns.IBM)
AAPL    0.410011
GOOG    0.390689
IBM     1.000000
MSFT    0.495980
dtype: float64

>>> returns.corrwith(volume)
AAPL   -0.057549
GOOG    0.062647
IBM    -0.007892
MSFT   -0.014245
dtype: float64

>>> obj=Series(['c','b','c','c','d','a','g','b'])
>>> obj.value_counts()
c    3
b    2
g    1
d    1
a    1
dtype: int64
>>> pd.value_counts(obj.values,sort=False)
a    1
c    3
b    2
d    1
g    1
dtype: int64

# Membership: which values are in the given set
>>> mask=obj.isin(['b','c'])
>>> mask
0     True
1     True
2     True
3     True
4    False
5    False
6    False
7     True
dtype: bool

>>> obj[mask]
0    c
1    b
2    c
3    c
7    b
dtype: object

# Frequency table per column (histogram-style counts)
>>> data=DataFrame({'Qu1':[1,3,4,5,3],'Qu2':[2,4,1,2,4],'Qu3':[3,4,2,1,1]})
>>> data
   Qu1  Qu2  Qu3
0    1    2    3
1    3    4    4
2    4    1    2
3    5    2    1
4    3    4    1
>>> data.apply(pd.value_counts).fillna(0)
   Qu1  Qu2  Qu3
1  1.0  1.0  2.0
2  0.0  2.0  1.0
3  2.0  0.0  1.0
4  1.0  2.0  1.0
5  1.0  0.0  0.0



# Handling missing data
>>> string_data=Series(['張三','李四',np.nan,'趙六'])
>>> string_data
0     張三
1     李四
2    NaN
3     趙六
dtype: object
>>> string_data.isnull()
0    False
1    False
2     True
3    False
dtype: bool

###### Filtering out missing data
>>> from numpy import nan as NA
>>> data=Series([1,NA,3.5,NA,7])
>>> data.dropna()
0    1.0
2    3.5
4    7.0
dtype: float64
>>> data
0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64
>>> data[data.notnull()]
0    1.0
2    3.5
4    7.0
dtype: float64

# By default, DataFrame drops any row that contains an NA
>>> data=DataFrame([[1.,6.5,3.],[1,NA,NA],[NA,NA,NA],[NA,6.5,3.]])
>>> data
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0
>>> data.dropna()
     0    1    2
0  1.0  6.5  3.0
# how='all' drops only rows where every value is NA
>>> data.dropna(how='all')
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
3  NaN  6.5  3.0

# Drop columns that are entirely null
>>> data
     0    1    2   4
0  1.0  6.5  3.0 NaN
1  1.0  NaN  NaN NaN
2  NaN  NaN  NaN NaN
3  NaN  6.5  3.0 NaN
>>> data.dropna(axis=1,how='all')
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0

# thresh keeps rows with at least that many non-NA values (not the count of NAs)
# (df here is a random frame with NaNs in its first rows; a self-contained sketch follows the output)
>>> df.dropna(thresh=3)
          0         1         2
5  0.519277  1.182077 -0.500918
6 -0.050867 -0.051302  1.368309
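Since df above was created off-screen, a self-contained thresh sketch:

>>> d=DataFrame([[1.,NA,NA],[2.,3.,NA],[4.,5.,6.]])
>>> d.dropna(thresh=2)   # keep rows with at least 2 non-NA values
     0    1    2
1  2.0  3.0  NaN
2  4.0  5.0  6.0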

# Filling in missing data
>>> df.fillna(-1)
          0         1         2
0  0.581403 -1.000000 -1.000000
1 -1.709160 -1.000000 -1.000000
2  2.496074 -1.000000 -1.000000
3  0.329339 -1.000000  0.736299
4 -0.638106 -1.000000  0.756044
5  0.519277  1.182077 -0.500918
6 -0.050867 -0.051302  1.368309
# Different fill values per column
>>> df.fillna({1:0.5,3:-1})
          0         1         2
0  0.581403  0.500000       NaN
1 -1.709160  0.500000       NaN
2  2.496074  0.500000       NaN
3  0.329339  0.500000  0.736299
4 -0.638106  0.500000  0.756044
5  0.519277  1.182077 -0.500918
6 -0.050867 -0.051302  1.368309

# inplace=True modifies the original object (a new object is returned by default)
>>> df.fillna({1:0.5,3:-1},inplace=True)
          0         1         2
0  0.581403  0.500000       NaN
1 -1.709160  0.500000       NaN
2  2.496074  0.500000       NaN
3  0.329339  0.500000  0.736299
4 -0.638106  0.500000  0.756044
5  0.519277  1.182077 -0.500918
6 -0.050867 -0.051302  1.368309
>>> df
          0         1         2
0  0.581403  0.500000       NaN
1 -1.709160  0.500000       NaN
2  2.496074  0.500000       NaN
3  0.329339  0.500000  0.736299
4 -0.638106  0.500000  0.756044
5  0.519277  1.182077 -0.500918
6 -0.050867 -0.051302  1.368309

>>> info=DataFrame(np.random.randn(6,3))
>>> info.ix[:2,1]=NA;info.ix[4:,2]=NA
>>> info
          0         1         2
0  1.217480       NaN  0.479981
1 -2.104463       NaN -2.917539
2 -2.141440       NaN -1.371574
3  0.925971  1.697813  0.814347
4 -1.463290 -0.526497       NaN
5 -0.300475  0.839098       NaN
# limit caps how many consecutive values get filled
>>> info.fillna(method='bfill',limit=1)
          0         1         2
0  1.217480       NaN  0.479981
1 -2.104463       NaN -2.917539
2 -2.141440  1.697813 -1.371574
3  0.925971  1.697813  0.814347
4 -1.463290 -0.526497       NaN
5 -0.300475  0.839098       NaN

# Hierarchical indexing
>>> data=Series(np.random.randn(10),index=[['a','a','a','b','b','b','c','c','d','d'],[1,2,3,1,2,3,1,2,2,3]])
>>> data
a  1    1.148945
   2   -0.489120
   3    1.151546
b  1    0.840938
   2   -1.992375
   3    0.039002
c  1    2.157531
   2    0.963063
d  2    0.130796
   3    0.012320
dtype: float64
>>> data.index
MultiIndex(levels=[[u'a', u'b', u'c', u'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])
>>> data['b']
1    0.840938
2   -1.992375
3    0.039002
dtype: float64
>>> data['b':'c']
b  1    0.840938
   2   -1.992375
   3    0.039002
c  1    2.157531
   2    0.963063
dtype: float64
>>> data.ix[['b','d']]
b  1    0.840938
   2   -1.992375
   3    0.039002
d  2    0.130796
   3    0.012320
dtype: float64

>>> data[:,2]
a   -0.489120
b   -1.992375
c    0.963063
d    0.130796
dtype: float64

# Converting to a DataFrame
>>> data.unstack()
          1         2         3
a  1.148945 -0.489120  1.151546
b  0.840938 -1.992375  0.039002
c  2.157531  0.963063       NaN
d       NaN  0.130796  0.012320

>>> data.unstack().stack()
a  1    1.148945
   2   -0.489120
   3    1.151546
b  1    0.840938
   2   -1.992375
   3    0.039002
c  1    2.157531
   2    0.963063
d  2    0.130796
   3    0.012320
dtype: float64

>>> frame=DataFrame(np.arange(12).reshape((4,3)),index=[['a','a','b','b'],[1,2,1,2]],columns=[['good','good','bad'],['G','R','G']])
>>> frame
    good     bad
       G   R   G
a 1    0   1   2
  2    3   4   5
b 1    6   7   8
  2    9  10  11

>>> frame.index.names=['key1','key2']
>>> frame.columns.names=['state','color']
>>> frame
state     good     bad
color        G   R   G
key1 key2             
a    1       0   1   2
     2       3   4   5
b    1       6   7   8
     2       9  10  11

>>> frame['good']
color      G   R
key1 key2       
a    1     0   1
     2     3   4
b    1     6   7
     2     9  10

# Reordering the levels
>>> frame.swaplevel('key1','key2')
state     good     bad
color        G   R   G
key2 key1             
1    a       0   1   2
2    a       3   4   5
1    b       6   7   8
2    b       9  10  11
>>> frame.sortlevel(1)
state     good     bad
color        G   R   G
key1 key2             
a    1       0   1   2
b    1       6   7   8
a    2       3   4   5
b    2       9  10  11
>>> frame.swaplevel(0,1).sortlevel(0)
state     good     bad
color        G   R   G
key2 key1             
1    a       0   1   2
     b       6   7   8
2    a       3   4   5
     b       9  10  11
# Aggregating by level
>>> frame.sum(level='key2')
state good     bad
color    G   R   G
key2              
1        6   8  10
2       12  14  16
>>> frame.sum(level='color',axis=1)
color       G   R
key1 key2        
a    1      2   1
     2      8   4
b    1     14   7
     2     20  10

# Using a DataFrame's columns as its index
>>> frame=DataFrame({'a':range(7),'b':range(7,0,-1),'c':['one','one','one','two','two','two','two'],'d':[0,1,2,0,1,2,3]})
>>> frame
   a  b    c  d
0  0  7  one  0
1  1  6  one  1
2  2  5  one  2
3  3  4  two  0
4  4  3  two  1
5  5  2  two  2
6  6  1  two  3
>>> frame2=frame.set_index(['c','d'])
>>> frame2
       a  b
c   d      
one 0  0  7
    1  1  6
    2  2  5
two 0  3  4
    1  4  3
    2  5  2
    3  6  1
>>> frame2=frame.set_index(['c','d'],drop=False)
>>> frame2
       a  b    c  d
c   d              
one 0  0  7  one  0
    1  1  6  one  1
    2  2  5  one  2
two 0  3  4  two  0
    1  4  3  two  1
    2  5  2  two  2
    3  6  1  two  3
 
############## Reading files ################
>>> os.system('cat  /Users/similarface/Downloads/jnn.csv')
准考證號,姓名,班級,語文,數學,英語,化學,物理
304040250124,羅茜,1,101,94,102.5,79,74
304040250128,沈怡君,1,91.5,96,69,82,69
304040250321,魏華,2,74,28,42,56,56
304040250233,何仕林,2,60.5,42,34.5,49,46
304040250725,屈妮,5,93.5,63,77.5,55,66
304040250709,鄧培蓓,5,102.5,81,47,65,58
304040250805,鄭清霞,5,89,80,63.5,63,65
304040250827,明楊,6,108.5,92,79,89,83
304040250819,李倩,6,93.5,61,44,45,32
304040250912,江明悅,6,0,0,0,0,00
>>> pd.read_csv('/Users/similarface/Downloads/jnn.csv')
           准考證號   姓名  班級     語文  數學     英語  化學  物理
0  304040250124   羅茜   1  101.0  94  102.5  79  74
1  304040250128  沈怡君   1   91.5  96   69.0  82  69
2  304040250321   魏華   2   74.0  28   42.0  56  56
3  304040250233  何仕林   2   60.5  42   34.5  49  46
4  304040250725   屈妮   5   93.5  63   77.5  55  66
5  304040250709  鄧培蓓   5  102.5  81   47.0  65  58
6  304040250805  鄭清霞   5   89.0  80   63.5  63  65
7  304040250827   明楊   6  108.5  92   79.0  89  83
8  304040250819   李倩   6   93.5  61   44.0  45  32
9  304040250912  江明悅   6    0.0   0    0.0   0   0
>>> pd.read_csv('/Users/similarface/Downloads/jnn.csv',index_col='准考證號')
               姓名  班級     語文  數學     英語  化學  物理
准考證號                                           
304040250124   羅茜   1  101.0  94  102.5  79  74
304040250128  沈怡君   1   91.5  96   69.0  82  69
304040250321   魏華   2   74.0  28   42.0  56  56
304040250233  何仕林   2   60.5  42   34.5  49  46
304040250725   屈妮   5   93.5  63   77.5  55  66
304040250709  鄧培蓓   5  102.5  81   47.0  65  58
304040250805  鄭清霞   5   89.0  80   63.5  63  65
304040250827   明楊   6  108.5  92   79.0  89  83
304040250819   李倩   6   93.5  61   44.0  45  32
304040250912  江明悅   6    0.0   0    0.0   0   0

# Fields separated by a variable amount of whitespace
>>> result=pd.read_table('ext3.txt',sep='\s+')
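ext3.txt is not shown; a self-contained sketch of the same idea using StringIO:

>>> from StringIO import StringIO
>>> txt='a  b   c\n1  2   3\n4  5   6'
>>> pd.read_table(StringIO(txt),sep='\s+')
   a  b  c
0  1  2  3
1  4  5  6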

# Rows to skip while reading
>>> pd.read_csv('/Users/similarface/Downloads/jnn.csv',index_col='准考證號',skiprows=[5,9])
               姓名  班級     語文  數學     英語  化學  物理
准考證號                                           
304040250124   羅茜   1  101.0  94  102.5  79  74
304040250128  沈怡君   1   91.5  96   69.0  82  69
304040250321   魏華   2   74.0  28   42.0  56  56
304040250233  何仕林   2   60.5  42   34.5  49  46
304040250709  鄧培蓓   5  102.5  81   47.0  65  58
304040250805  鄭清霞   5   89.0  80   63.5  63  65
304040250827   明楊   6  108.5  92   79.0  89  83
304040250912  江明悅   6    0.0   0    0.0   0   0

# Handling missing-value sentinels
# Values recognized as NA by default include: NA, -1.#IND, NULL
>>> os.system('cat  /Users/similarface/Downloads/ex5.csv')
something,a,b,c,d,message
one,1,2,IND,4,NA
tow,-1,-1,,8,world
three,.,10,11,NULL,foo
>>> pd.read_csv('/Users/similarface/Downloads/ex5.csv',na_values=['NULL'])
  something   a   b    c    d message
0       one   1   2  IND  4.0     NaN
1       tow  -1  -1  NaN  8.0   world
2     three   .  10   11  NaN     foo
# Specifying extra NA sentinels
>>> pd.read_csv('/Users/similarface/Downloads/ex5.csv',na_values=['-1'])
  something    a     b    c    d message
0       one    1   2.0  IND  4.0     NaN
1       tow  NaN   NaN  NaN  8.0   world
2     three    .  10.0   11  NaN     foo
>>> sentinels={'message':['foo','NA'],'something':['tow']}
>>> pd.read_csv('/Users/similarface/Downloads/ex5.csv',na_values=sentinels)
  something   a   b    c    d message
0       one   1   2  IND  4.0     NaN
1       NaN  -1  -1  NaN  8.0   world
2     three   .  10   11  NaN     NaN

'''
filepath_or_buffer : str, pathlib.Path, py._path.local.LocalPath or any object with a read() method (such as a file handle or StringIO)
The string could be a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. For instance, a local file could be file ://localhost/path/to/table.csv
sep : str, default ‘,’
Delimiter to use. If sep is None, will try to automatically determine this. Regular expressions are accepted and will force use of the python parsing engine and will ignore quotes in the data.
delimiter : str, default None
Alternative argument name for sep.
header : int or list of ints, default ‘infer’
Row number(s) to use as the column names, and the start of the data. Default behavior is as if set to 0 if no names passed, otherwise None. Explicitly pass header=0 to be able to replace existing names. The header can be a list of integers that specify row locations for a multi-index on the columns e.g. [0,1,3]. Intervening rows that are not specified will be skipped (e.g. 2 in this example is skipped). Note that this parameter ignores commented lines and empty lines if skip_blank_lines=True, so header=0 denotes the first line of data rather than the first line of the file.
names : array-like, default None
List of column names to use. If file contains no header row, then you should explicitly pass header=None
index_col : int or sequence or False, default None
Column to use as the row labels of the DataFrame. If a sequence is given, a MultiIndex is used. If you have a malformed file with delimiters at the end of each line, you might consider index_col=False to force pandas to _not_ use the first column as the index (row names)
usecols : array-like, default None
Return a subset of the columns. Results in much faster parsing time and lower memory usage.
squeeze : boolean, default False
If the parsed data only contains one column then return a Series
prefix : str, default None
Prefix to add to column numbers when no header, e.g. ‘X’ for X0, X1, ...
mangle_dupe_cols : boolean, default True
Duplicate columns will be specified as ‘X.0’...’X.N’, rather than ‘X’...’X’
dtype : Type name or dict of column -> type, default None
Data type for data or columns. E.g. {‘a’: np.float64, ‘b’: np.int32} (Unsupported with engine=’python’). Use str or object to preserve and not interpret dtype.
engine : {‘c’, ‘python’}, optional
Parser engine to use. The C engine is faster while the python engine is currently more feature-complete.
converters : dict, default None
Dict of functions for converting values in certain columns. Keys can either be integers or column labels
true_values : list, default None
Values to consider as True
false_values : list, default None
Values to consider as False
skipinitialspace : boolean, default False
Skip spaces after delimiter.
skiprows : list-like or integer, default None
Line numbers to skip (0-indexed) or number of lines to skip (int) at the start of the file
skipfooter : int, default 0
Number of lines at bottom of file to skip (Unsupported with engine=’c’)
nrows : int, default None
Number of rows of file to read. Useful for reading pieces of large files
na_values : str or list-like or dict, default None
Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted as NaN: ‘’, ‘#N/A’, ‘#N/A N/A’, ‘#NA’, ‘-1.#IND’, ‘-1.#QNAN’, ‘-NaN’, ‘-nan’, ‘1.#IND’, ‘1.#QNAN’, ‘N/A’, ‘NA’, ‘NULL’, ‘NaN’, ‘nan’.
keep_default_na : bool, default True
If na_values are specified and keep_default_na is False the default NaN values are overridden, otherwise they’re appended to.
na_filter : boolean, default True
Detect missing value markers (empty strings and the value of na_values). In data without any NAs, passing na_filter=False can improve the performance of reading a large file
verbose : boolean, default False
Indicate number of NA values placed in non-numeric columns
skip_blank_lines : boolean, default True
If True, skip over blank lines rather than interpreting as NaN values
parse_dates : boolean or list of ints or names or list of lists or dict, default False
boolean. If True -> try parsing the index.
list of ints or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 each as a separate date column.
list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
a single date column.
dict, e.g. {‘foo’ : [1, 3]} -> parse columns 1, 3 as date and call result ‘foo’
Note: A fast-path exists for iso8601-formatted dates.
infer_datetime_format : boolean, default False
If True and parse_dates is enabled for a column, attempt to infer the datetime format to speed up the processing
keep_date_col : boolean, default False
If True and parse_dates specifies combining multiple columns then keep the original columns.
date_parser : function, default None
Function to use for converting a sequence of string columns to an array of datetime instances. The default uses dateutil.parser.parser to do the conversion. Pandas will try to call date_parser in three different ways, advancing to the next if an exception occurs: 1) Pass one or more arrays (as defined by parse_dates) as arguments; 2) concatenate (row-wise) the string values from the columns defined by parse_dates into a single array and pass that; and 3) call date_parser once for each row using one or more strings (corresponding to the columns defined by parse_dates) as arguments.
dayfirst : boolean, default False
DD/MM format dates, international and European format
iterator : boolean, default False
Return TextFileReader object for iteration or getting chunks with get_chunk().
chunksize : int, default None
Return TextFileReader object for iteration. See IO Tools docs for more information on iterator and chunksize.
compression : {‘infer’, ‘gzip’, ‘bz2’, None}, default ‘infer’
For on-the-fly decompression of on-disk data. If ‘infer’, then use gzip or bz2 if filepath_or_buffer is a string ending in ‘.gz’ or ‘.bz2’, respectively, and no decompression otherwise. Set to None for no decompression.
thousands : str, default None
Thousands separator
decimal : str, default ‘.’
Character to recognize as decimal point (e.g. use ‘,’ for European data).
lineterminator : str (length 1), default None
Character to break file into lines. Only valid with C parser.
quotechar : str (length 1), optional
The character used to denote the start and end of a quoted item. Quoted items can include the delimiter and it will be ignored.
quoting : int or csv.QUOTE_* instance, default None
Control field quoting behavior per csv.QUOTE_* constants. Use one of QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3). Default (None) results in QUOTE_MINIMAL behavior.
escapechar : str (length 1), default None
One-character string used to escape delimiter when quoting is QUOTE_NONE.
comment : str, default None
Indicates remainder of line should not be parsed. If found at the beginning of a line, the line will be ignored altogether. This parameter must be a single character. Like empty lines (as long as skip_blank_lines=True), fully commented lines are ignored by the parameter header but not by skiprows. For example, if comment='#', parsing '#empty\na,b,c\n1,2,3' with header=0 will result in 'a,b,c' being treated as the header.
encoding : str, default None
Encoding to use for UTF when reading/writing (ex. ‘utf-8’). List of Python standard encodings
dialect : str or csv.Dialect instance, default None
If None defaults to Excel dialect. Ignored if sep longer than 1 char See csv.Dialect documentation for more details
tupleize_cols : boolean, default False
Leave a list of tuples on columns as is (default is to convert to a Multi Index on the columns)
error_bad_lines : boolean, default True
Lines with too many fields (e.g. a csv line with too many commas) will by default cause an exception to be raised, and no DataFrame will be returned. If False, then these “bad lines” will dropped from the DataFrame that is returned. (Only valid with C parser)
warn_bad_lines : boolean, default True
If error_bad_lines is False, and warn_bad_lines is True, a warning for each “bad line” will be output. (Only valid with C parser).
'''
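For example, iterator/chunksize return a TextFileReader for piecewise reading; a sketch over the jnn.csv file above (10 data rows):

>>> chunker=pd.read_csv('/Users/similarface/Downloads/jnn.csv',chunksize=4)
>>> for piece in chunker:
...     print len(piece)
... 
4
4
2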
# Writing data out
data.to_csv('path or sys.stdout',sep='|',index=True/False,header=True/False,cols=[columns to keep])
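A quick sketch writing to sys.stdout so the result is visible (cols was this pandas generation's column-subset keyword; it was later renamed columns):

>>> import sys
>>> out=DataFrame({'a':[1,2],'b':[3,4],'c':[5,6]})
>>> out.to_csv(sys.stdout,sep='|')
|a|b|c
0|1|3|5
1|2|4|6
>>> out.to_csv(sys.stdout,index=False,cols=['a','c'])
a,c
1,5
2,6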
# Database operations
import pandas as pd
from pandas import *
import sqlite3
query="""
create table test(
a varchar(20),b VARCHAR(20),c REAL ,d INTEGER
);
"""
con=sqlite3.connect(':memory:')
con.execute(query)
con.commit()
data=[('Atlanta','Georgia',1.25,6),
      ('Tallahassee','Florida',2.6,3),
      ('Sacramento','California',1.7,5)
       ]
stmt="INSERT INTO test VALUES (?,?,?,?)"
con.executemany(stmt,data)
con.commit()
cursor=con.execute('select * from test')
rows=cursor.fetchall()
DataFrame(rows,columns=zip(*cursor.description)[0])
# Query SQL directly into a DataFrame
import pandas.io.sql as sql
sql.read_sql('select * from test',con)



# Merging data sets
>>> df1 = DataFrame(
...     {'key': ['北京大學', '四川大學', '天津大學', '山東大學', '清華大學'],
...      'major0': ['計算機','生物','化學','物理','醫學']
...     })
>>> df2 = DataFrame(
...     {'key': ['北京大學', '四川大學', '雲南大學'],
...      'major1': ['外國語', '口腔', '旅游']
... })
>>> df1
    key major0
0  北京大學    計算機
1  四川大學     生物
2  天津大學     化學
3  山東大學     物理
4  清華大學     醫學
>>> df2
    key major1
0  北京大學    外國語
1  四川大學     口腔
2  雲南大學     旅游

>>> pd.merge(df1,df2)
    key major0 major1
0  北京大學    計算機    外國語
1  四川大學     生物     口腔

>>> df3 = DataFrame(
...     {'lkey': ['北京大學', '四川大學', '天津大學', '山東大學', '清華大學'],
...      'major0': ['計算機','生物','化學','物理','醫學']
...     })
>>> df4 = DataFrame(
...     {'rkey': ['北京大學', '四川大學', '雲南大學'],
...      'major1': ['外國語', '口腔', '旅游']
... })

>>> df3
   lkey major0
0  北京大學    計算機
1  四川大學     生物
2  天津大學     化學
3  山東大學     物理
4  清華大學     醫學
>>> df4
  major1  rkey
0    外國語  北京大學
1     口腔  四川大學
2     旅游  雲南大學

>>> pd.merge(df3,df4,left_on='lkey',right_on='rkey')
   lkey major0 major1  rkey
0  北京大學    計算機    外國語  北京大學
1  四川大學     生物     口腔  四川大學
# Outer join
>>> pd.merge(df3,df4,left_on='lkey',right_on='rkey',how='outer')
   lkey major0 major1  rkey
0  北京大學    計算機    外國語  北京大學
1  四川大學     生物     口腔  四川大學
2  天津大學     化學    NaN   NaN
3  山東大學     物理    NaN   NaN
4  清華大學     醫學    NaN   NaN
5   NaN    NaN     旅游  雲南大學
# Left join
>>> pd.merge(df3,df4,left_on='lkey',right_on='rkey',how='left')
   lkey major0 major1  rkey
0  北京大學    計算機    外國語  北京大學
1  四川大學     生物     口腔  四川大學
2  天津大學     化學    NaN   NaN
3  山東大學     物理    NaN   NaN
4  清華大學     醫學    NaN   NaN
# Right join
>>> pd.merge(df3,df4,left_on='lkey',right_on='rkey',how='right')
   lkey major0 major1  rkey
0  北京大學    計算機    外國語  北京大學
1  四川大學     生物     口腔  四川大學
2   NaN    NaN     旅游  雲南大學
# Inner join
>>> pd.merge(df3,df4,left_on='lkey',right_on='rkey',how='inner')
   lkey major0 major1  rkey
0  北京大學    計算機    外國語  北京大學
1  四川大學     生物     口腔  四川大學

# Merging on multiple keys

left=DataFrame({
    'key1':['foo','foo','bar'],
    'key2':['one','two','one'],
    'lval':[1,2,3]
})

right=DataFrame({
    'key1':['foo','foo','bar','bar'],
    'key2':['one','one','one','two'],
    'lval':[4,5,6,7]
})

>>> pd.merge(left,right,on=['key1','key2'],how='outer')
  key1 key2  lval_x  lval_y
0  foo  one     1.0     4.0
1  foo  one     1.0     5.0
2  foo  two     2.0     NaN
3  bar  one     3.0     6.0
4  bar  two     NaN     7.0

# Handling overlapping column names
>>> pd.merge(left,right,on='key1',suffixes=('_lef','_right'))
  key1 key2_lef  lval_lef key2_right  lval_right
0  foo      one         1        one           4
1  foo      one         1        one           5
2  foo      two         2        one           4
3  foo      two         2        one           5
4  bar      one         3        one           6
5  bar      one         3        two           7

# Merging on the index
>>> right1=DataFrame({'group_val':[3.5,7]},index=['a','b'])
>>> left1=DataFrame({'key':['a','b','a','a','b','c'],'value':range(6)})
# The left key column is matched against the right index
>>> pd.merge(left1,right1,left_on='key',right_index=True)
  key  value  group_val
0   a      0        3.5
2   a      2        3.5
3   a      3        3.5
1   b      1        7.0
4   b      4        7.0

lefth=DataFrame(
    {'key1':['similar','similar','similar','face','face'],
     'key2':[2000,2001,2002,2001,2002],
     'data':np.arange(5.)
     })

righth=DataFrame(np.arange(12).reshape((6,2)),
                 index=[['face','face','similar','similar','similar','similar'],
                        [2001,2000,2000,2000,2001,2002]
                        ],
                 columns=['event1','event2']
                 )
>>> lefth
   data     key1  key2
0   0.0  similar  2000
1   1.0  similar  2001
2   2.0  similar  2002
3   3.0     face  2001
4   4.0     face  2002
>>> righth
              event1  event2
face    2001       0       1
        2000       2       3
similar 2000       4       5
        2000       6       7
        2001       8       9
        2002      10      11

>>> pd.merge(lefth,righth,left_on=['key1','key2'],right_index=True)
   data     key1  key2  event1  event2
0   0.0  similar  2000       4       5
0   0.0  similar  2000       6       7
1   1.0  similar  2001       8       9
2   2.0  similar  2002      10      11
3   3.0     face  2001       0       1

>>> left2=DataFrame([[1.,2.],[3.,4.],[5.,6.]],index=['a','c','e'],columns=['similar','face'])
>>> left2
   similar  face
a      1.0   2.0
c      3.0   4.0
e      5.0   6.0
>>> right2=DataFrame([[7.,8.],[9.,10.],[11.,12.],[13.,14.]],index=['b','c','d','e'],columns=['M','A'])
>>> right2
      M     A
b   7.0   8.0
c   9.0  10.0
d  11.0  12.0
e  13.0  14.0

>>> pd.merge(left2,right2,how='outer',left_index=True,right_index=True)
   similar  face     M     A
a      1.0   2.0   NaN   NaN
b      NaN   NaN   7.0   8.0
c      3.0   4.0   9.0  10.0
d      NaN   NaN  11.0  12.0
e      5.0   6.0  13.0  14.0
>>> left2.join(right2,how='outer')
   similar  face     M     A
a      1.0   2.0   NaN   NaN
b      NaN   NaN   7.0   8.0
c      3.0   4.0   9.0  10.0
d      NaN   NaN  11.0  12.0
e      5.0   6.0  13.0  14.0
>>> another=DataFrame([[7,8],[9,10],[11,12],[16,17]],index=['a','c','e','f'],columns=['NK','O'])
>>> left2.join([right2,another])
   similar  face     M     A  NK   O
a      1.0   2.0   NaN   NaN   7   8
c      3.0   4.0   9.0  10.0   9  10
e      5.0   6.0  13.0  14.0  11  12

# Concatenating along an axis
>>> arr=np.arange(12).reshape((3,4))
>>> arr
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])
>>> np.concatenate([arr,arr],axis=1)
array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])
>>> s1=Series([0,1],index=['a','b'])
>>> s2=Series([2,3,4],index=['c','d','e'])
>>> s3=Series([5,6],index=['f','g'])
>>> s1
a    0
b    1
dtype: int64
>>> s2
c    2
d    3
e    4
dtype: int64
>>> s3
f    5
g    6
dtype: int64
>>> pd.concat([s1,s2,s3])
a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64
>>> pd.concat([s1,s2,s3,s1])
a    0
b    1
c    2
d    3
e    4
f    5
g    6
a    0
b    1
dtype: int64
>>> pd.concat([s1,s2,s3,s1],axis=1)
     0    1    2    3
a  0.0  NaN  NaN  0.0
b  1.0  NaN  NaN  1.0
c  NaN  2.0  NaN  NaN
d  NaN  3.0  NaN  NaN
e  NaN  4.0  NaN  NaN
f  NaN  NaN  5.0  NaN
g  NaN  NaN  6.0  NaN

df1=DataFrame(np.arange(6).reshape(3,2),index=['a','b','c'],columns=['one','two'])
df2=DataFrame(5+np.arange(4).reshape(2,2),index=['a','c'],columns=['three','four'])
>>> pd.concat([df1,df2],axis=1,keys=['level1','level2'])
  level1     level2     
     one two  three four
a      0   1    5.0  6.0
b      2   3    NaN  NaN
c      4   5    7.0  8.0
>>> pd.concat({'level1':df1,'level2':df2},axis=1)
  level1     level2     
     one two  three four
a      0   1    5.0  6.0
b      2   3    NaN  NaN
c      4   5    7.0  8.0
>>> pd.concat([df1,df2],axis=1,keys=['L1','L2'],names=['u','l'])
u  L1        L2     
l one two three four
a   0   1   5.0  6.0
b   2   3   NaN  NaN
c   4   5   7.0  8.0
>>> df1=DataFrame(np.random.randn(3,4),columns=['a','b','c','d'])
>>> df2=DataFrame(np.random.randn(2,3),columns=['b','d','a'])
>>> df1
          a         b         c         d
0 -1.487358  0.077565  0.209403 -0.712507
1  1.990047 -0.221415  1.381161 -0.876811
2 -0.153150  0.391847  1.180728 -0.972548
>>> df2
          b         d         a
0 -0.200611  0.321759 -0.201620
1 -1.842735 -1.924933  0.281712

>>> pd.concat([df1,df2])
          a         b         c         d
0 -1.487358  0.077565  0.209403 -0.712507
1  1.990047 -0.221415  1.381161 -0.876811
2 -0.153150  0.391847  1.180728 -0.972548
0 -0.201620 -0.200611       NaN  0.321759
1  0.281712 -1.842735       NaN -1.924933

>>> pd.concat([df1,df2],ignore_index=True)
          a         b         c         d
0 -1.487358  0.077565  0.209403 -0.712507
1  1.990047 -0.221415  1.381161 -0.876811
2 -0.153150  0.391847  1.180728 -0.972548
3 -0.201620 -0.200611       NaN  0.321759
4  0.281712 -1.842735       NaN -1.924933

>>> pd.concat([df1,df2],ignore_index=True,axis=1)
          0         1         2         3         4         5         6
0 -1.487358  0.077565  0.209403 -0.712507 -0.200611  0.321759 -0.201620
1  1.990047 -0.221415  1.381161 -0.876811 -1.842735 -1.924933  0.281712
2 -0.153150  0.391847  1.180728 -0.972548       NaN       NaN       NaN

# (a and b are assumed here to be float Series over index f..a, consistent with the outputs below, e.g.:)
>>> a=Series([np.nan,2.5,np.nan,3.5,4.5,np.nan],index=['f','e','d','c','b','a'])
>>> b=Series([0.,1.,2.,3.,4.,np.nan],index=['f','e','d','c','b','a'])
>>> b[:-2]
f    0.0
e    1.0
d    2.0
c    3.0
dtype: float64
>>> a[2:]
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64
>>> b[:-2].combine_first(a[2:])
a    NaN
b    4.5
c    3.0
d    2.0
e    1.0
f    0.0
dtype: float64

>>> df1=DataFrame({'a':[1,np.nan,5,np.nan],'b':[np.nan,2,np.nan,6],'c':range(2,18,4)})
>>> df2=DataFrame({'a':[5,4,np.nan,3,7],'b':[np.nan,3,4,6,8]})
>>> df2
     a    b
0  5.0  NaN
1  4.0  3.0
2  NaN  4.0
3  3.0  6.0
4  7.0  8.0
>>> df1
     a    b   c
0  1.0  NaN   2
1  NaN  2.0   6
2  5.0  NaN  10
3  NaN  6.0  14
>>> df1.combine_first(df2)
     a    b     c
0  1.0  NaN   2.0
1  4.0  2.0   6.0
2  5.0  4.0  10.0
3  3.0  6.0  14.0
4  7.0  8.0   NaN

# Reshaping and pivoting
>>> data=DataFrame(np.arange(6).reshape((2,3)),index=pd.Index(['similar','face'],name='state'),columns=pd.Index(['one','two','three'],name='number'))
>>> data
number   one  two  three
state                   
similar    0    1      2
face       3    4      5
>>> data.stack()
state    number
similar  one       0
         two       1
         three     2
face     one       3
         two       4
         three     5
dtype: int64
>>> data.stack().unstack()
number   one  two  three
state                   
similar    0    1      2
face       3    4      5

>>> data.stack().unstack(0)
state   similar  face
number               
one           0     3
two           1     4
three         2     5

>>> data.stack().unstack('state')
state   similar  face
number               
one           0     3
two           1     4
three         2     5

>>> s1=Series([0,1,2,3],index=['a','b','c','d'])
>>> s2=Series([4,5,6],index=['c','d','e'])
>>> s1
a    0
b    1
c    2
d    3
dtype: int64
>>> s2
c    4
d    5
e    6
dtype: int64
>>> pd.concat([s1,s2],keys=['one','two'])
one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64
>>> pd.concat([s1,s2],keys=['one','two']).unstack()
       a    b    c    d    e
one  0.0  1.0  2.0  3.0  NaN
two  NaN  NaN  4.0  5.0  6.0
>>> pd.concat([s1,s2],keys=['one','two']).unstack().stack()
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64
>>> pd.concat([s1,s2],keys=['one','two']).unstack().stack(dropna=False)
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64


# Transforming data with a function or mapping

data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',
                           'corned beef', 'Bacon', 'pastrami', 'honey ham',
                           'nova lox'],
                  'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

meat_to_animal = {
  'bacon': 'pig',
  'pulled pork': 'pig',
  'pastrami': 'cow',
  'corned beef': 'cow',
  'honey ham': 'pig',
  'nova lox': 'salmon'
}

data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
>>> data
          food  ounces  animal
0        bacon     4.0     pig
1  pulled pork     3.0     pig
2        bacon    12.0     pig
3     Pastrami     6.0     cow
4  corned beef     7.5     cow
5        Bacon     8.0     pig
6     pastrami     3.0     cow
7    honey ham     5.0     pig
8     nova lox     6.0  salmon

>>> data['food'].map(lambda x: meat_to_animal[x.lower()])
0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

Discretization and binning:
>>> ages=[20,22,25,27,21,23,37,31,61,45,41,32]   # assumed example data, consistent with the outputs below
>>> bins=[18,25,35,60,100]
>>> cats=pd.cut(ages,bins)
# Specifying the group names
>>> group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
>>> pd.cut(ages,bins,labels=group_names)
[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]
# Equal-width bins: split the random numbers below into 4 bins; precision sets the number of decimal places
>>> data=np.random.rand(20)
>>> data
array([ 0.42519089,  0.18981873,  0.29726754,  0.37843724,  0.31072184,
        0.20240683,  0.99244468,  0.61880299,  0.9948212 ,  0.32893834,
        0.87701908,  0.25638677,  0.02344737,  0.15162624,  0.31874342,
        0.16534997,  0.43495775,  0.83059911,  0.57975644,  0.53763544])
>>> pd.cut(data,4,precision=2)
[(0.27, 0.51], (0.022, 0.27], (0.27, 0.51], (0.27, 0.51], (0.27, 0.51], ..., (0.022, 0.27], (0.27, 0.51], (0.75, 0.99], (0.51, 0.75], (0.51, 0.75]]
Length: 20
Categories (4, object): [(0.022, 0.27] < (0.27, 0.51] < (0.51, 0.75] < (0.75, 0.99]]
# Counts per bin
>>> pd.value_counts(cats)
(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64
# Closed on the left, open on the right
pd.cut(ages, [18, 26, 36, 61, 100], right=False)

# Detecting and filtering outliers
>>> np.random.seed(12345)
>>> data=DataFrame(np.random.randn(1000,4))
>>> data.describe()
                 0            1            2            3
count  1000.000000  1000.000000  1000.000000  1000.000000
mean     -0.067684     0.067924     0.025598    -0.002298
std       0.998035     0.992106     1.006835     0.996794
min      -3.428254    -3.548824    -3.184377    -3.745356
25%      -0.774890    -0.591841    -0.641675    -0.644144
50%      -0.116401     0.101143     0.002073    -0.013611
75%       0.616366     0.780282     0.680391     0.654328
max       3.366626     2.653656     3.260383     3.927528
>>> col=data[3]
>>> col[np.abs(col)>3]
97     3.927528
305   -3.399312
400   -3.745356
Name: 3, dtype: float64

# Random reordering (permutation)
>>> df=DataFrame(np.arange(5*4).reshape(5,4))   # df assumed; consistent with the output below
>>> sampler=np.random.permutation(5)
>>> df.take(sampler)
    0   1   2   3
4  16  17  18  19
2   8   9  10  11
1   4   5   6   7
3  12  13  14  15
0   0   1   2   3

>>> df.take(np.random.permutation(len(df))[:3])
   0  1   2   3
1  4  5   6   7
2  8  9  10  11
0  0  1   2   3


# Drawing a larger sample (with replacement) from the values of a given array
>>> bag=np.array([5,7,-1,6,4])
>>> sampler=np.random.randint(0,len(bag),size=10)
>>> sampler
array([1, 0, 4, 1, 2, 1, 4, 4, 3, 4])
>>> draws=bag.take(sampler)
>>> draws
array([ 7,  5,  4,  7, -1,  7,  4,  4,  6,  4])

# Dummy variable / indicator matrices
# a matrix of whether each value appears in a given column
>>> df=DataFrame({'key':['b','b','a','c','a','b'],'data1':range(6)})
>>> df
   data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   b
>>> pd.get_dummies(df['key'])
     a    b    c
0  0.0  1.0  0.0
1  0.0  1.0  0.0
2  1.0  0.0  0.0
3  0.0  0.0  1.0
4  1.0  0.0  0.0
5  0.0  1.0  0.0

# Adding a prefix to the dummy column names
>>> dummies=pd.get_dummies(df['key'],prefix='key')
>>> dummies
   key_a  key_b  key_c
0    0.0    1.0    0.0
1    0.0    1.0    0.0
2    1.0    0.0    0.0
3    0.0    0.0    1.0
4    1.0    0.0    0.0
5    0.0    1.0    0.0
>>> df_with_dummy=df[['data1']].join(dummies)
>>> df_with_dummy
   data1  key_a  key_b  key_c
0      0    0.0    1.0    0.0
1      1    0.0    1.0    0.0
2      2    1.0    0.0    0.0
3      3    0.0    0.0    1.0
4      4    1.0    0.0    0.0
5      5    0.0    1.0    0.0


>>> values=np.random.rand(10)   # assumed setup, consistent with the array below
>>> values
array([ 0.86789062,  0.4187927 ,  0.48191735,  0.44540277,  0.6855452 ,
        0.33193716,  0.20772778,  0.21461227,  0.50985294,  0.95327048])
>>> 
>>> bins=[0,0.2,0.4,0.6,0.8,1]
>>> pd.get_dummies(pd.cut(values,bins))
   (0, 0.2]  (0.2, 0.4]  (0.4, 0.6]  (0.6, 0.8]  (0.8, 1]
0       0.0         0.0         0.0         0.0       1.0
1       0.0         0.0         1.0         0.0       0.0
2       0.0         0.0         1.0         0.0       0.0
3       0.0         0.0         1.0         0.0       0.0
4       0.0         0.0         0.0         1.0       0.0
5       0.0         1.0         0.0         0.0       0.0
6       0.0         1.0         0.0         0.0       0.0
7       0.0         1.0         0.0         0.0       0.0
8       0.0         0.0         1.0         0.0       0.0
9       0.0         0.0         0.0         0.0       1.0

# Email regular expression
>>> import re
>>> pattern=r'([A-Z0-9.%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
>>> regex=re.compile(pattern,flags=re.IGNORECASE)
>>> regex.match('jaflfbs@sina.com')
<_sre.SRE_Match object at 0x111ceab78>
>>> m=regex.match('jaflfbs@sina.com')
>>> m.groups()
('jaflfbs', 'sina', 'com')
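
# findall scans a whole string and returns one tuple per match; the text below is illustrative, not from the original session
>>> text='Dave dave@google.com\nSteve steve@gmail.com'
>>> regex.findall(text)
[('dave', 'google', 'com'), ('steve', 'gmail', 'com')]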



# grouping with groupby
>>> df=DataFrame({'key1':['a','a','b','b','a'],'key2':['one','two','one','tow','one'],'data1':np.random.randn(5),'data2':np.random.randn(5)})
>>> df
      data1     data2 key1 key2
0 -0.893905  0.311668    a  one
1  1.274761  0.885820    a  two
2  1.115914  0.887069    b  one
3  0.054165  0.267643    b  tow
4 -0.819516  0.933495    a  one
>>> grouped=df['data1'].groupby(df['key1'])
>>> grouped
<pandas.core.groupby.SeriesGroupBy object at 0x111e11e10>
>>> grouped.mean()
key1
a   -0.14622
b    0.58504
Name: data1, dtype: float64
>>> means=df['data1'].groupby([df['key1'],df['key2']]).mean()
>>> means
key1  key2
a     one    -0.856710
      two     1.274761
b     one     1.115914
      tow     0.054165
Name: data1, dtype: float64

>>> means.unstack()
key2       one       tow       two
key1                              
a    -0.856710       NaN  1.274761
b     1.115914  0.054165       NaN

# the group keys can be arbitrary arrays of the right length
>>> states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
>>> years = np.array([2005, 2005, 2006, 2005, 2006])
>>> df['data1'].groupby([states,years]).mean()
# the keys can also be column names; key2 is missing from the result because it is not numeric
>>> df.groupby('key1').mean()
        data1     data2
key1                   
a    -0.14622  0.710328
b     0.58504  0.577356

>>> df.groupby(['key1','key2']).mean()
              data1     data2
key1 key2                    
a    one  -0.856710  0.622582
     two   1.274761  0.885820
b    one   1.115914  0.887069
     tow   0.054165  0.267643

# size of each group
>>> df.groupby(['key1','key2']).size()
key1  key2
a     one     2
      two     1
b     one     1
      tow     1

# split the DataFrame into a dict keyed by group name
>>> pieces=dict(list(df.groupby('key1')))
>>> pieces['b']
      data1     data2 key1 key2
2  1.115914  0.887069    b  one
3  0.054165  0.267643    b  tow
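
# the dict(list(...)) trick works because a GroupBy object is iterable, yielding (name, group) pairs; a minimal sketch
>>> for name,group in df.groupby('key1'):
...     print(name)
...     print(group)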






############ date and time operations
>>> from datetime import datetime
>>> now=datetime.now()
>>> now
datetime.datetime(2016, 4, 12, 14, 31, 50, 995484)
>>> now.year,now.month,now.day
(2016, 4, 12)
>>> now.day
12
>>> # the difference of two datetimes is a datetime.timedelta (stored as days, seconds, microseconds)
>>> delta=datetime(2016,5,1)-datetime(2016,5,2)
>>> delta
datetime.timedelta(-1)
>>> delta.days
-1
>>> delta.seconds
0
>>> from datetime import timedelta
>>> start=datetime(2011,1,1)
>>> start+timedelta(12)
datetime.datetime(2011, 1, 13, 0, 0)
>>> start-2*timedelta(12)
datetime.datetime(2010, 12, 8, 0, 0)
>>> stamp=datetime(2011,1,3)
>>> str(stamp)
'2011-01-03 00:00:00'
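# formatting in the other direction is explicit with strftime (same stamp as above)
>>> stamp.strftime('%Y-%m-%d')
'2011-01-03'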
>>> value='2016-01-01'
>>> datetime.strptime(value,'%Y-%m-%d')
datetime.datetime(2016, 1, 1, 0, 0)
>>> value='2016-01-13'
>>> datetime.strptime(value,'%Y-%m-%d')
datetime.datetime(2016, 1, 13, 0, 0)
>>> value='2016-13-13'
>>> datetime.strptime(value,'%Y-%m-%d')
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/_strptime.py", line 325, in _strptime
    (data_string, format))
ValueError: time data '2016-13-13' does not match format '%Y-%m-%d'

>>> datestrs=['7/6/2016','1/1/1111']
>>> [datetime.strptime(x,'%m/%d/%Y')  for x in datestrs]
[datetime.datetime(2016, 7, 6, 0, 0), datetime.datetime(1111, 1, 1, 0, 0)]

>>> from dateutil.parser import parse
>>> parse('2016-01-09')
datetime.datetime(2016, 1, 9, 0, 0)
>>> parse('Jan 31,2015 10:31 PM')
datetime.datetime(2015, 1, 31, 22, 31)
>>> parse('1/3/2018',dayfirst=True)
datetime.datetime(2018, 3, 1, 0, 0)
>>> parse('1/3/2018',dayfirst=False)
datetime.datetime(2018, 1, 3, 0, 0)


>>> datestrs=['1/4/2016','4/1/2017']
>>> pd.to_datetime(datestrs)
DatetimeIndex(['2016-01-04', '2017-04-01'], dtype='datetime64[ns]', freq=None)
>>> idx=pd.to_datetime(datestrs+[None])
>>> idx
DatetimeIndex(['2016-01-04', '2017-04-01', 'NaT'], dtype='datetime64[ns]', freq=None)

>>> pd.isnull(idx)
array([False, False,  True], dtype=bool)


>>> dates=[datetime(2011,1,2),datetime(2016,1,1),datetime(2016,1,2),datetime(2016,1,3),datetime(2016,1,4),datetime(2016,1,5)]
>>> dates
[datetime.datetime(2011, 1, 2, 0, 0), datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 1, 2, 0, 0), datetime.datetime(2016, 1, 3, 0, 0), datetime.datetime(2016, 1, 4, 0, 0), datetime.datetime(2016, 1, 5, 0, 0)]
>>> from pandas import *
>>> ts=Series(np.random.randn(6),index=dates)
>>> ts
2011-01-02    0.734018
2016-01-01    1.661590
2016-01-02    0.839504
2016-01-03   -1.295834
2016-01-04    0.190545
2016-01-05    0.267724
dtype: float64


>>> ts+ts[::2]
2011-01-02    1.468037
2016-01-01         NaN
2016-01-02    1.679008
2016-01-03         NaN
2016-01-04    0.381091
2016-01-05         NaN
dtype: float64

>>> ts.index.dtype
dtype('<M8[ns]')
>>> stamp=ts.index[0]
>>> stamp
Timestamp('2011-01-02 00:00:00')
>>> stamp=ts.index[2]
>>> ts[stamp]
0.83950398236998658
>>> ts['1/1/2016']
1.6615901161098698

>>> longer_ts=Series(np.random.randn(1000),index=pd.date_range('1/1/2000',periods=1000))
>>> longer_ts['2002-09-21':'2002-09-23']
2002-09-21   -0.105898
2002-09-22    1.708342
2002-09-23   -0.815799
Freq: D, dtype: float64
>>> longer_ts['2002-09-21':'09/23/2002']
2002-09-21   -0.105898
2002-09-22    1.708342
2002-09-23   -0.815799
Freq: D, dtype: float64
>>> longer_ts['2002-09-21':'23/09/2002']
2002-09-21   -0.105898
2002-09-22    1.708342
2002-09-23   -0.815799
Freq: D, dtype: float64

>>> longer_ts.truncate(before='2002-09-23')
2002-09-23   -0.815799
2002-09-24   -0.140892
2002-09-25   -0.397591
2002-09-26    0.451815
Freq: D, dtype: float64
>>> longer_ts.truncate(after='2002-09-23')   # keeps everything up to and including 2002-09-23

# time series with duplicate timestamps

>>> dates=pd.DatetimeIndex(['1/1/2016','1/2/2016','1/2/2016','1/2/2016','1/3/2016'])
>>> dates
DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-02', '2016-01-02',
               '2016-01-03'],
              dtype='datetime64[ns]', freq=None)
>>> dup_ts=Series(range(5),index=dates)
>>> dup_ts
2016-01-01    0
2016-01-02    1
2016-01-02    2
2016-01-02    3
2016-01-03    4
dtype: int64
>>> dup_ts.index.is_unique
False
>>> dup_ts['1/2/2016']
2016-01-02    1
2016-01-02    2
2016-01-02    3
dtype: int64
>>> grouped=dup_ts.groupby(level=0)
>>> grouped.mean()
2016-01-01    0
2016-01-02    2
2016-01-03    4
dtype: int64
>>> grouped.max()
2016-01-01    0
2016-01-02    3
2016-01-03    4
dtype: int64
>>> grouped.count()
2016-01-01    1
2016-01-02    3
2016-01-03    1
dtype: int64

# daily dates from April 1 through June 1
>>> index=pd.date_range('4/1/2016','6/1/2016')
# from a start date, generate the given number of periods
>>> pd.date_range(start='4/1/2016',periods=20)
DatetimeIndex(['2016-04-01', '2016-04-02', '2016-04-03', '2016-04-04',
               '2016-04-05', '2016-04-06', '2016-04-07', '2016-04-08',
               '2016-04-09', '2016-04-10', '2016-04-11', '2016-04-12',
               '2016-04-13', '2016-04-14', '2016-04-15', '2016-04-16',
               '2016-04-17', '2016-04-18', '2016-04-19', '2016-04-20'],
              dtype='datetime64[ns]', freq='D')

>>> pd.date_range(end='2016-12-12',periods=10)
DatetimeIndex(['2016-12-03', '2016-12-04', '2016-12-05', '2016-12-06',
               '2016-12-07', '2016-12-08', '2016-12-09', '2016-12-10',
               '2016-12-11', '2016-12-12'],
              dtype='datetime64[ns]', freq='D')

>>> pd.date_range('1/1/2016','12/2/2016',freq='BM')
DatetimeIndex(['2016-01-29', '2016-02-29', '2016-03-31', '2016-04-29',
               '2016-05-31', '2016-06-30', '2016-07-29', '2016-08-31',
               '2016-09-30', '2016-10-31', '2016-11-30'],
              dtype='datetime64[ns]', freq='BM')

>>> pd.date_range('5/2/2012 12:12:12',periods=5)
DatetimeIndex(['2012-05-02 12:12:12', '2012-05-03 12:12:12',
               '2012-05-04 12:12:12', '2012-05-05 12:12:12',
               '2012-05-06 12:12:12'],
              dtype='datetime64[ns]', freq='D')
# normalize=True truncates the timestamps to midnight
>>> pd.date_range('5/2/2016 12:13:14',periods=5,normalize=True)
DatetimeIndex(['2016-05-02', '2016-05-03', '2016-05-04', '2016-05-05',
               '2016-05-06'],
              dtype='datetime64[ns]', freq='D')

>>> from pandas.tseries.offsets import Hour,Minute
>>> hour=Hour()
>>> hour
<Hour>
>>> four_hours=Hour(4)
>>> four_hours
<4 * Hours>
>>> 
>>> pd.date_range('1/1/2016','1/2/2016',freq='4h')
DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 04:00:00',
               '2016-01-01 08:00:00', '2016-01-01 12:00:00',
               '2016-01-01 16:00:00', '2016-01-01 20:00:00',
               '2016-01-02 00:00:00'],
              dtype='datetime64[ns]', freq='4H')

>>> pd.date_range('1/1/2000',periods=2,freq='1h30min')
DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 01:30:00'], dtype='datetime64[ns]', freq='90T')

freq
-----------------------------
http://pandas.pydata.org/pandas-docs/version/0.18.0/timeseries.html#dateoffset-objects
-----------------------------
D        calendar day (daily)
B        business day
H        hour
T        minute
S        second
L        millisecond
U        microsecond
M        last calendar day of each month
BM       last business day of each month
MS       first calendar day of each month
BMS      first business day of each month
W-MON    weekly, anchored on the given weekday (also W-TUE, W-WED, W-THU, W-FRI, W-SAT, W-SUN)
WOM-1MON, WOM-2MON, ...  week-of-month: first Monday, second Monday of each month, etc.
Q-JAN    quarterly, year anchored on the last calendar day of the given month (JAN, FEB, ..., DEC)
BQ-JAN   quarterly, anchored on the last business day of the given month

AS-JAN   annual, first calendar day of the given month
BAS-JAN, BAS-FEB  annual, first business day of the given month
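
# a quick sketch of a couple of these aliases (dates arbitrary, outputs omitted)
>>> pd.date_range('1/1/2016',periods=4,freq='W-MON')   # weekly, anchored on Mondays
>>> pd.date_range('1/1/2016',periods=4,freq='BM')      # last business day of each month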

>>> rng=pd.date_range('1/1/2016','9/1/2012',freq='WOM-3FRI')   # end date before start: empty index
>>> rng
DatetimeIndex([], dtype='datetime64[ns]', freq='WOM-3FRI')
>>> rng=pd.date_range('1/1/2016','9/1/2016',freq='WOM-3FRI')
>>> rng
DatetimeIndex(['2016-01-15', '2016-02-19', '2016-03-18', '2016-04-15',
               '2016-05-20', '2016-06-17', '2016-07-15', '2016-08-19'],
              dtype='datetime64[ns]', freq='WOM-3FRI')

>>> ts=Series(np.random.randn(4),index=pd.date_range('1/1/2000',periods=4,freq='M'))
>>> ts
2000-01-31    0.246254
2000-02-29    0.426385
2000-03-31    0.832971
2000-04-30    1.163773
Freq: M, dtype: float64
>>> ts.shift(2)
2000-01-31         NaN
2000-02-29         NaN
2000-03-31    0.246254
2000-04-30    0.426385
Freq: M, dtype: float64
>>> ts.shift(-2)
2000-01-31    0.832971
2000-02-29    1.163773
2000-03-31         NaN
2000-04-30         NaN
Freq: M, dtype: float64

# percent change from the previous period
>>> ts/ts.shift(1)-1
2000-01-31         NaN
2000-02-29    0.731486
2000-03-31    0.953564
2000-04-30    0.397135
Freq: M, dtype: float64
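
# pandas provides this directly as pct_change, which should match the shift expression above
>>> ts.pct_change()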

>>> ts.shift(2,freq='M')
2000-03-31    0.246254
2000-04-30    0.426385
2000-05-31    0.832971
2000-06-30    1.163773
Freq: M, dtype: float64

>>> ts.shift(3,freq='D')
2000-02-03    0.246254
2000-03-03    0.426385
2000-04-03    0.832971
2000-05-03    1.163773
dtype: float64

>>> ts.shift(1,freq='3D')
2000-02-03    0.246254
2000-03-03    0.426385
2000-04-03    0.832971
2000-05-03    1.163773
dtype: float64

>>> ts.shift(1,freq='90T')
2000-01-31 01:30:00    0.246254
2000-02-29 01:30:00    0.426385
2000-03-31 01:30:00    0.832971
2000-04-30 01:30:00    1.163773
Freq: M, dtype: float64


>>> from pandas.tseries.offsets import Day,MonthEnd
>>> now=datetime(2011,11,17)
>>> now
datetime.datetime(2011, 11, 17, 0, 0)
>>> now+3*Day()
Timestamp('2011-11-20 00:00:00')
>>> now+MonthEnd()
Timestamp('2011-11-30 00:00:00')
>>> now+MonthEnd(2)
Timestamp('2011-12-31 00:00:00')

>>> offset=MonthEnd()
>>> offset.rollforward(now)
Timestamp('2011-11-30 00:00:00')
>>> now
datetime.datetime(2011, 11, 17, 0, 0)
>>> offset.rollback(now)
Timestamp('2011-10-31 00:00:00')

>>> ts=Series(np.random.randn(20),index=pd.date_range('1/12/2016',periods=20,freq='4d'))
>>> ts.groupby(offset.rollforward).mean()
2016-01-31   -0.023515
2016-02-29    0.332412
2016-03-31    0.445600
dtype: float64

>>> ts.resample('M',how='mean')
2016-01-31    0.705208
2016-02-29   -0.174444
2016-03-31    0.534282
Freq: M, dtype: float64
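
# note: the how= argument was deprecated after the 0.18 era; newer pandas uses a method call on the resampler
>>> ts.resample('M').mean()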

# period arithmetic
>>> p=pd.Period(2016,freq='A-DEC')
>>> p
Period('2016', 'A-DEC')
>>> p+5
Period('2021', 'A-DEC')
>>> p-2
Period('2014', 'A-DEC')
>>> pd.Period('2014',freq='A-DEC')-p
-2
>>> rng=pd.period_range('1/1/2016','6/30/2016',freq='M')
>>> rng
PeriodIndex(['2016-01', '2016-02', '2016-03', '2016-04', '2016-05', '2016-06'], dtype='int64', freq='M')

>>> Series(np.random.randn(6),index=rng)
2016-01   -0.739693
2016-02   -0.928667
2016-03    0.176348
2016-04    1.343980
2016-05   -1.513816
2016-06    0.654137
Freq: M, dtype: float64

>>> values=['2010Q3','2012Q2','2013Q1']
>>> index=pd.PeriodIndex(values,freq='Q-DEC')
>>> index
PeriodIndex(['2010Q3', '2012Q2', '2013Q1'], dtype='int64', freq='Q-DEC')


# period frequency conversion
>>> p=pd.Period('2007',freq='A-DEC')
>>> p.asfreq('M',how='start')
Period('2007-01', 'M')
>>> p.asfreq('M',how='end')
Period('2007-12', 'M')
>>> p=pd.Period('2007',freq='A-FEB')
>>> p.asfreq('M',how='start')
Period('2006-03', 'M')
>>> p.asfreq('M',how='end')
Period('2007-02', 'M')
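
# a period can also be converted back to a timestamp; a sketch with the same A-FEB period
>>> p.to_timestamp(how='start')   # first day covered: 2006-03-01
>>> p.to_timestamp(how='end')     # end of the period: 2007-02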

