DadgadCafe · June 30, 2017 03:44
diff --git a/np.py b/np.py
 import numpy as np

 # linspace: create evenly spaced points over an interval

 # Build a 2*3 integer matrix from a nested list. The rows must be separated
 # by a comma, and the dtype has to be reached through the numpy namespace
 # (a bare `int32` is not a defined name).
 arr = np.array([[1, 2, 3],
                 [4, 5, 6]], dtype=np.int32) # list to matrix

 arr.ndim # 2 dimensions
 arr.shape # (2, 3)
 arr.size # number of elements: 6

 np.zeros((3, 4), dtype=int) # 3*4, filled with 0

 # sized integer dtypes live in the numpy namespace; a bare `int16` is undefined
 np.ones((3, 4), dtype=np.int16) # 3*4, filled with 1

 np.empty((3, 4), dtype=int) # 3*4, uninitialized: the contents are arbitrary

 a = np.arange(10, 20, 2) # 10 up to (not including) 20, step 2
 a[0] #10

 # A chained call can only continue on the next line when the whole expression
 # is wrapped in parentheses.
 a = (np.arange(10, 50, 2) # 10 up to (not including) 50, step 2 => 20 items
      .reshape((4, 5))) # reshape to 4*5
 a[0][0] #10
 a[0, 0] #same
 a[0, 1:3] # [12, 14]

 (np.linspace(1, 10, 20) # 20 evenly spaced points from 1 to 10, both included
  .reshape((5, 4)))

 # matrix operation
 # Arithmetic and comparison operators on arrays work element by element.
 a1 = np.arange(5) # array([0, 1, 2, 3, 4])
 a2 = np.arange(10, 15) # array([10, 11, 12, 13, 14])

 a2 - a1 # array([10, 10, 10, 10, 10])
 a2 + a1 # array([10, 12, 14, 16, 18])
 a2 * a1 # element-wise product, not a matrix product: array([ 0, 11, 24, 39, 56])
 a1 / a2 # array([ 0.        ,  0.09090909,  0.16666667,  0.23076923,  0.28571429])
 a1 ** 2 # array([ 0,  1,  4,  9, 16])
 a1 < 0 # array([False, False, False, False, False], dtype=bool)
 np.sin(a1) # element-wise sine

 a = np.array([[1,1],[0,1]])
 b = np.arange(4).reshape((2,2)) # [[0, 1], [2, 3]]
 np.dot(a, b) # matrix product: array([[2, 4], [2, 3]])
 a.dot(b) # same
 np.dot([1,2,3],[4,5,6]) # inner product of two 1-D sequences: 1*4 + 2*5 + 3*6 = 32

 a = np.random.random((2,4)) # 2*4 matrix of random floats in [0, 1)
 np.sum(a) # sum of all elements
 np.max(a) # largest element
 np.min(a) # smallest element
 np.sum(a, axis=0) # sum by column: one value per column (4 values)
 np.sum(a, axis=1) # sum by row: one value per row (2 values)


 A = np.arange(2,14).reshape((3,4))
 # array([[ 2, 3, 4, 5]
 #        [ 6, 7, 8, 9]
 #        [10,11,12,13]])
 np.argmax(A) # index (in the flattened array) of the maximum, 13: 11
 np.argmin(A) # index (in the flattened array) of the minimum, 2: 0
 np.mean(A) # A.mean() # 7.5
 np.median(A) # 7.5 (function only: ndarray has no .median() method)
 np.median(A, axis=0) # median of each column: array([6., 7., 8., 9.])
 np.average(A) # 7.5
 np.cumsum(A) # [2, 2+3, 2+3+4, ...] => array([ 2,  5,  9, 14, 20, 27, 35, 44, 54, 65, 77, 90])
 np.diff(A) # differences within each row: [[3-2, 4-3, 5-4], ...] => array([[1, 1, 1], [1, 1, 1], [1, 1, 1]])
 np.nonzero(A) # position of nonzero: (0, 0), (0, 1) ... returned as (row indices, column indices)
               #(array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]),
               # array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]))

 # Wrap the chained call in parentheses so that .reshape may sit on its own
 # line; without them the second line is a syntax error.
 A = (np.arange(14, 2, -1)
      .reshape((3,4)))
 # array([[14, 13, 12, 11],
 #        [10,  9,  8,  7],
 #        [ 6,  5,  4,  3]])
 np.sort(A) # sorts each row in ascending order and returns a new array
 # array([[11,12,13,14]
 #        [ 7, 8, 9,10]
 #        [ 3, 4, 5, 6]])
 np.transpose(A)
 A.T # same
 # array([[14,10, 6]
 #        [13, 9, 5]
 #        [12, 8, 4]
 #        [11, 7, 3]])
 (A.T).dot(A) # (4*3) dot (3*4) => 4*4
 # array([[332, 302, 272, 242],
 #        [302, 275, 248, 221],
 #        [272, 248, 224, 200],
 #        [242, 221, 200, 179]])
 np.clip(A, 5, 9) # >9 => 9; <5 => 5
 # array([[ 9, 9, 9, 9]
 #        [ 9, 9, 8, 7]
 #        [ 6, 5, 5, 5]])

 a = np.arange(4).reshape(2, 2)
 a[0, 0] # 0

 # Looping over a 2-D array walks its first axis: one row per step.
 for row in a:
     print(row)

 # Looping over the transpose walks the columns instead.
 for column in np.transpose(a):
     print(column)

 a.flatten() # array([0, 1, 2, 3])
 # .flat visits every single element in turn
 for item in a.flat:
     print(item)


 a = np.array([1, 1, 1])
 b = np.array([2, 2, 2])
 np.vstack((a, b)) # stack as rows: array([[1, 1, 1], [2, 2, 2]])
 np.hstack((a, b)) # join end to end: array([1, 1, 1, 2, 2, 2])
 a[:, np.newaxis] # add a second axis, shape (3,) => (3, 1): array([[1], [1], [1]])
 # using concatenate
 # 1-D inputs only have axis 0, so this gives array([1, 1, 1, 2, 2, 2, 2, 2, 2]);
 # for 2-D inputs: 0:vertical 1:horizontal
 np.concatenate((a, b, b), axis=0)


 a = np.arange(12).reshape((3, 4))
 np.hsplit(a, 2)
 np.split(a, 2, axis=1) # horizontal
 # [ array([[0, 1], [4, 5], [8, 9]]),
 #   array([[2, 3], [6, 7], [10, 11]])]
 np.vsplit(a, 3)
 np.split(a, 3, axis=0) # vertically
 # [array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[8, 9, 10, 11]])]
 # split the same lower-case `a` as above (the output below belongs to it);
 # array_split allows parts of unequal size, np.split does not
 np.array_split(a, 3, axis=1) # uneven split
 # [ array([[0, 1],[4, 5],[8, 9]]),
 #   array([[2], [6], [10]]),
 #   array([[3], [7], [11]])]

 a = np.arange(4)
 b = a # plain assignment: b is a second name for the same array object
 a is b # True
 c = a.copy() # a separate array object
 c is a # False



 #pandas
 import pandas as pd
 s = pd.Series([1, 3, np.nan, 5])
 # 0    1.0
 # 1    3.0
 # 2    NaN
 # 3    5.0
 # dtype: float64

 dates = pd.date_range('20170101', periods=6)
 # DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04', '2017-01-05', '2017-01-06'],
 #               dtype='datetime64[ns]', freq='D')

 df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
 #                    a         b         c         d
 # 2017-01-01 -0.669733  0.091818  0.581845 -0.290370
 # 2017-01-02  0.203958 -0.840011 -1.234419  1.567374
 # 2017-01-03  0.761231 -0.712473  0.954426  2.002349
 # 2017-01-04  0.477278  0.860596  0.867349  0.438903
 # 2017-01-05 -1.431947  0.684325 -0.762821  0.815071
 # 2017-01-06 -0.095380 -0.515609  0.184032 -0.482174

 pd.DataFrame(np.arange(12).reshape((3, 4)))
 #    0  1   2   3
 # 0  0  1   2   3
 # 1  4  5   6   7
 # 2  8  9  10  11

 # Build a frame from a dict: scalars are repeated for every row, sequences
 # supply one value per row.
 df2 = pd.DataFrame({'A': 1.,
                     'B': pd.Timestamp('20130102'),
                     'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                     'D': np.arange(4),
                     'E': pd.Categorical(['test', 'train', 'test', 'train']),
                     'F': 'foo'})
 #      A          B    C  D      E    F
 # 0  1.0 2013-01-02  1.0  0   test  foo
 # 1  1.0 2013-01-02  1.0  1  train  foo
 # 2  1.0 2013-01-02  1.0  2   test  foo
 # 3  1.0 2013-01-02  1.0  3  train  foo
 df2.dtypes # dtype of each column (a DataFrame has no `.type` attribute)
 df2.index # row labels
 df2.columns # column labels
 df2.shape # (4, 6): rows, columns (a DataFrame has no `.rows` attribute)
 df2.values # the data as a numpy array
 df2.describe() # summary statistics
 df2.T # transpose
 df2.sort_index(axis=1, ascending=False) # sort by column name
 df2.sort_values(by='B') # sort by B column value

 # data selection
 dates = pd.date_range('20130101', periods=6)
 df = pd.DataFrame(np.arange(24).reshape((6,4)),
                  index=dates,
                  columns=['A','B','C','D'])

 '''
             A   B   C   D
 2013-01-01   0   1   2   3
 2013-01-02   4   5   6   7
 2013-01-03   8   9  10  11
 2013-01-04  12  13  14  15
 2013-01-05  16  17  18  19
 2013-01-06  20  21  22  23
 '''

 df['A']
 df.A
 '''
 2013-01-01     0
 2013-01-02     4
 2013-01-03     8
 2013-01-04    12
 2013-01-05    16
 2013-01-06    20
 Freq: D, Name: A, dtype: int64
 '''

 df[0:3]
 '''
            A  B   C   D
 2013-01-01  0  1   2   3
 2013-01-02  4  5   6   7
 2013-01-03  8  9  10  11
 '''

 df['20130102':'20130104']
 '''
 A   B   C   D
 2013-01-02   4   5   6   7
 2013-01-03   8   9  10  11
 2013-01-04  12  13  14  15
 '''

 df.loc['20130102']
 '''
 A    4
 B    5
 C    6
 D    7
 Name: 2013-01-02 00:00:00, dtype: int64
 '''

 df.loc[:,['A','B']]
 '''
             A   B
 2013-01-01   0   1
 2013-01-02   4   5
 2013-01-03   8   9
 2013-01-04  12  13
 2013-01-05  16  17
 2013-01-06  20  21
 '''

 df.loc['20130102',['A','B']]
 '''
 A    4
 B    5
 Name: 2013-01-02 00:00:00, dtype: int64
 '''

 df.iloc[3,1] # 13
 df.iloc[3:5, 1:3]
 '''
             B   C
 2013-01-04  13  14
 2013-01-05  17  18
 '''
 df.iloc[[1,3,5],1:3]
 '''
             B   C
 2013-01-02   5   6
 2013-01-04  13  14
 2013-01-06  21  22
 '''

 # The mixed position/label indexer .ix was removed from pandas. Pick the
 # first three rows by position (df.index[:3]) and the columns by label
 # through a single .loc call instead.
 df.loc[df.index[:3], ['A','C']]
 '''
             A   C
 2013-01-01  0   2
 2013-01-02  4   6
 2013-01-03  8  10
 '''

 # Boolean indexing:
 df[df.A>8]
 '''
             A   B   C   D
 2013-01-04  12  13  14  15
 2013-01-05  16  17  18  19
 2013-01-06  20  21  22  23
 '''


 dates = pd.date_range('20130101', periods=6)
 df = pd.DataFrame(np.arange(24).reshape((6, 4)),
                   index=dates,
                   columns=['A', 'B', 'C', 'D'])

 df.iloc[2, 2] = 111 # set by position: third row, third column (C)
 df.loc['20130101', 'B'] = 222 # set by label
 '''
              A    B    C   D
 2013-01-01   0  222    2   3
 2013-01-02   4    5    6   7
 2013-01-03   8    9  111  11
 2013-01-04  12   13   14  15
 2013-01-05  16   17   18  19
 2013-01-06  20   21   22  23
 '''
 # Assign through one .loc call. The chained form `df.B[df.A>4] = 0` writes
 # into an intermediate object and is not guaranteed to change df.
 df.loc[df.A > 4, 'B'] = 0
 '''
              A    B    C   D
 2013-01-01   0  222    2   3
 2013-01-02   4    5    6   7
 2013-01-03   8    0  111  11
 2013-01-04  12    0   14  15
 2013-01-05  16    0   18  19
 2013-01-06  20    0   22  23
 '''
 df['F'] = np.nan # new column, same value in every row
 '''
              A    B    C   D   F
 2013-01-01   0  222    2   3 NaN
 2013-01-02   4    5    6   7 NaN
 2013-01-03   8    0  111  11 NaN
 2013-01-04  12    0   14  15 NaN
 2013-01-05  16    0   18  19 NaN
 2013-01-06  20    0   22  23 NaN
 '''
 # new column from a Series: values are matched to the rows by index label
 df['E'] = pd.Series([1,2,3,4,5,6],
                     index=pd.date_range('20130101',
                                         periods=6))
 '''
              A    B    C   D   F  E
 2013-01-01   0  222    2   3 NaN  1
 2013-01-02   4    5    6   7 NaN  2
 2013-01-03   8    0  111  11 NaN  3
 2013-01-04  12    0   14  15 NaN  4
 2013-01-05  16    0   18  19 NaN  5
 2013-01-06  20    0   22  23 NaN  6
 '''

 dates = pd.date_range('20130101', periods=6)
 df = pd.DataFrame(np.arange(24).reshape((6, 4)),
                   index=dates,
                   columns=['A', 'B', 'C', 'D'])
 # NaN is a float, so make the two target columns float before assigning:
 # recent pandas versions no longer upcast an integer column silently.
 df = df.astype({'B': float, 'C': float})
 df.iloc[0, 1] = np.nan
 df.iloc[1, 2] = np.nan
 '''
              A     B     C   D
 2013-01-01   0   NaN   2.0   3
 2013-01-02   4   5.0   NaN   7
 2013-01-03   8   9.0  10.0  11
 2013-01-04  12  13.0  14.0  15
 2013-01-05  16  17.0  18.0  19
 2013-01-06  20  21.0  22.0  23
 '''
 # returns a new frame; df itself is left unchanged
 df.dropna(
     axis=0,     # 0: operate on rows; 1: operate on columns
     how='any'   # 'any': drop if any NaN is present; 'all': drop only if every value is NaN
 )
 '''
              A     B     C   D
 2013-01-03   8   9.0  10.0  11
 2013-01-04  12  13.0  14.0  15
 2013-01-05  16  17.0  18.0  19
 2013-01-06  20  21.0  22.0  23
 '''
 df.fillna(value=0) # new frame with every NaN replaced by 0
 '''
              A     B     C   D
 2013-01-01   0   0.0   2.0   3
 2013-01-02   4   5.0   0.0   7
 2013-01-03   8   9.0  10.0  11
 2013-01-04  12  13.0  14.0  15
 2013-01-05  16  17.0  18.0  19
 2013-01-06  20  21.0  22.0  23
 '''
 df.isnull()
 '''
                 A      B      C      D
 2013-01-01  False   True  False  False
 2013-01-02  False  False   True  False
 2013-01-03  False  False  False  False
 2013-01-04  False  False  False  False
 2013-01-05  False  False  False  False
 2013-01-06  False  False  False  False
 '''
 # reduce over the underlying array so the answer is one bool for the whole frame
 df.isnull().values.any() # if exists nan: True


 # read
 data = pd.read_csv('students.csv')
 # to pickle
 data.to_pickle('student.pickle')

 # concat
 # axis, default 0
 df1 = pd.DataFrame(np.ones((3, 4))*0, columns=['a', 'b', 'c', 'd'])
 df2 = pd.DataFrame(np.ones((3, 4))*1, columns=['a', 'b', 'c', 'd'])
 df3 = pd.DataFrame(np.ones((3, 4))*2, columns=['a', 'b', 'c', 'd'])
 pd.concat([df1, df2, df3], axis=0)
 '''
     a    b    c    d
 0  0.0  0.0  0.0  0.0
 1  0.0  0.0  0.0  0.0
 2  0.0  0.0  0.0  0.0
 0  1.0  1.0  1.0  1.0
 1  1.0  1.0  1.0  1.0
 2  1.0  1.0  1.0  1.0
 0  2.0  2.0  2.0  2.0
 1  2.0  2.0  2.0  2.0
 2  2.0  2.0  2.0  2.0
 '''
 pd.concat([df1, df2, df3], axis=0, ignore_index=True)
 '''
     a    b    c    d
 0  0.0  0.0  0.0  0.0
 1  0.0  0.0  0.0  0.0
 2  0.0  0.0  0.0  0.0
 3  1.0  1.0  1.0  1.0
 4  1.0  1.0  1.0  1.0
 5  1.0  1.0  1.0  1.0
 6  2.0  2.0  2.0  2.0
 7  2.0  2.0  2.0  2.0
 8  2.0  2.0  2.0  2.0
 '''

 df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'], index=[1,2,3])
 df2 = pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d','e'], index=[2,3,4])
 pd.concat([df1, df2], axis=0, join='outer')
 '''
    a    b    c    d    e
 1  0.0  0.0  0.0  0.0  NaN
 2  0.0  0.0  0.0  0.0  NaN
 3  0.0  0.0  0.0  0.0  NaN
 2  NaN  1.0  1.0  1.0  1.0
 3  NaN  1.0  1.0  1.0  1.0
 4  NaN  1.0  1.0  1.0  1.0
 '''

 pd.concat([df1, df2], axis=0, join='inner')
 '''
    b    c    d
 1  0.0  0.0  0.0
 2  0.0  0.0  0.0
 3  0.0  0.0  0.0
 2  1.0  1.0  1.0
 3  1.0  1.0  1.0
 4  1.0  1.0  1.0
 '''

 pd.concat([df1, df2], axis=0, join='inner', ignore_index=True)
 '''
    b    c    d
 0  0.0  0.0  0.0
 1  0.0  0.0  0.0
 2  0.0  0.0  0.0
 3  1.0  1.0  1.0
 4  1.0  1.0  1.0
 5  1.0  1.0  1.0
 '''

 # horizontal by index
 df1 = pd.DataFrame(np.ones((3, 4)),
                    columns=['A', 'B', 'C', 'D'],
                    index=[1, 2, 3])
 df2 = pd.DataFrame(np.ones((3, 4)),
                    columns=['A', 'B', 'C', 'D'],
                    index=[2, 3, 4])
 # The join_axes argument was removed from pd.concat. Reindexing the result
 # with df1's index keeps only the rows labelled 1, 2, 3.
 pd.concat([df1, df2], axis=1).reindex(df1.index)

 #      A    B    C    D    A    B    C    D
 # 1  1.0  1.0  1.0  1.0  NaN  NaN  NaN  NaN
 # 2  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0
 # 3  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0


 df1 = pd.DataFrame(np.ones((3, 4)) * 0,
                    columns=['A', 'B', 'C', 'D'])
 df2 = pd.DataFrame(np.ones((3, 4)) * 1,
                    columns=['A', 'B', 'C', 'D'])
 # NOTE(review): df3 is not used in this example
 df3 = pd.DataFrame(np.ones((3, 4)) * 1,
                    columns=['A', 'B', 'C', 'D'])

 # DataFrame.append was removed in pandas 2.0; pd.concat does the same job.
 pd.concat([df1, df2], ignore_index=True)
 '''
      A    B    C    D
 0  0.0  0.0  0.0  0.0
 1  0.0  0.0  0.0  0.0
 2  0.0  0.0  0.0  0.0
 3  1.0  1.0  1.0  1.0
 4  1.0  1.0  1.0  1.0
 5  1.0  1.0  1.0  1.0
 '''

 # The Series index must match the frame's column labels (upper case here),
 # otherwise the values land in new columns.
 s1 = pd.Series([1,2,3,4],
                index=['A','B','C','D'])

 # A Series is added as one row: turn it into a 1-row frame first.
 pd.concat([df1, s1.to_frame().T], ignore_index=True)
 '''
 #      A    B    C    D
 # 0  0.0  0.0  0.0  0.0
 # 1  0.0  0.0  0.0  0.0
 # 2  0.0  0.0  0.0  0.0
 # 3  1.0  2.0  3.0  4.0
 '''


 left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
 right = pd.DataFrame({'key': ['K1', 'K2', 'K3', 'K4'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})
 '''
    A   B key
 0  A0  B0  K0
 1  A1  B1  K1
 2  A2  B2  K2
 3  A3  B3  K3
 '''
 '''
    C   D key
 0  C0  D0  K1
 1  C1  D1  K2
 2  C2  D2  K3
 3  C3  D3  K4
 '''
 pd.merge(left, right, on='key')
 '''
    A   B key   C   D
 0  A1  B1  K1  C0  D0
 1  A2  B2  K2  C1  D1
 2  A3  B3  K3  C2  D2
 '''

 left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                      'key2': ['K0', 'K1', 'K0', 'K1'],
                      'A': ['A0', 'A1', 'A2', 'A3'],
                      'B': ['B0', 'B1', 'B2', 'B3']})
 right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                       'key2': ['K0', 'K0', 'K0', 'K0'],
                       'C': ['C0', 'C1', 'C2', 'C3'],
                       'D': ['D0', 'D1', 'D2', 'D3']})

 pd.merge(left, right, on=['key1', 'key2'], how='inner')
 pd.merge(left, right, on=['key1', 'key2'], how='outer')
 pd.merge(left, right, on=['key1', 'key2'], how='left')
 pd.merge(left, right, on=['key1', 'key2'], how='right')


 df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b']})
 df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]})
 pd.merge(df1, df2, on='col1', how='outer', indicator=True)
 '''
 #   col1 col_left  col_right      _merge
 # 0   0.0        a        NaN   left_only
 # 1   1.0        b        2.0        both
 # 2   2.0      NaN        2.0  right_only
 # 3   2.0      NaN        2.0  right_only
 '''
 pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
 '''
  col1 col_left  col_right indicator_column
 0   0.0        a        NaN        left_only
 1   1.0        b        2.0             both
 2   2.0      NaN        2.0       right_only
 3   2.0      NaN        2.0       right_only
 '''

 # merge by index
 left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                     'B': ['B0', 'B1', 'B2']},
                     index=['K0', 'K1', 'K2'])
 right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                      'D': ['D0', 'D2', 'D3']},
                     index=['K0', 'K2', 'K3'])
 pd.merge(left, right, left_index=True, right_index=True, how='outer')
 #      A    B    C    D
 # K0   A0   B0   C0   D0
 # K1   A1   B1  NaN  NaN
 # K2   A2   B2   C2   D2
 # K3  NaN  NaN   C3   D3

 pd.merge(left, right, left_index=True, right_index=True, how='inner')
 #     A   B   C   D
 # K0  A0  B0  C0  D0
 # K2  A2  B2  C2  D2

 boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
 girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
 # use suffixes to tell apart the overlapping (same-named) columns
 pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')
 '''
   age_boy   k  age_girl
 0        1  K0         4
 1        1  K0         5
 '''

 # draw
 import matplotlib.pyplot as plt
 data = pd.Series(np.random.randn(1000),index=np.arange(1000))
 # cumsum() returns a new Series; keep it, otherwise the raw noise is plotted
 data = data.cumsum()
 data.plot()
 plt.show()
	import numpy as np

	# linspace: create evenly spaced points over an interval

	# Build a 2*3 integer matrix from a nested list. The rows must be separated
	# by a comma, and the dtype has to be reached through the numpy namespace
	# (a bare `int32` is not a defined name).
	arr = np.array([[1, 2, 3],
	                [4, 5, 6]], dtype=np.int32) # list to matrix

	arr.ndim # 2 dimensions
	arr.shape # (2, 3)
	arr.size # number of elements: 6

	np.zeros((3, 4), dtype=int) # 3*4, filled with 0

	# sized integer dtypes live in the numpy namespace; a bare `int16` is undefined
	np.ones((3, 4), dtype=np.int16) # 3*4, filled with 1

	np.empty((3, 4), dtype=int) # 3*4, uninitialized: the contents are arbitrary

	a = np.arange(10, 20, 2) # 10 up to (not including) 20, step 2
	a[0] #10

	# A chained call can only continue on the next line when the whole expression
	# is wrapped in parentheses.
	a = (np.arange(10, 50, 2) # 10 up to (not including) 50, step 2 => 20 items
	     .reshape((4, 5))) # reshape to 4*5
	a[0][0] #10
	a[0, 0] #same
	a[0, 1:3] # [12, 14]

	(np.linspace(1, 10, 20) # 20 evenly spaced points from 1 to 10, both included
	 .reshape((5, 4)))

	# matrix operation
	a1 = np.arange(5) # array([0, 1, 2, 3, 4])
	a2 = np.arange(10, 15) # array([10, 11, 12, 13, 14])

	a2 - a1 # array([10, 10, 10, 10, 10])
	a2 + a1 # array([10, 12, 14, 16, 18])
	a2 * a1 # array([ 0, 11, 24, 39, 56])
	a1 / a2 # array([ 0. , 0.09090909, 0.16666667, 0.23076923, 0.28571429])
	a1 ** 2 # array([ 0, 1, 4, 9, 16])
	a1 < 0 # array([False, False, False, False, False], dtype=bool)
	np.sin(a1)

	a = np.array([[1,1],[0,1]])
	b = np.arange(4).reshape((2,2)) # [[0, 1], [2, 3]]
	np.dot(a, b) # matrix product: array([[2, 4], [2, 3]])
	a.dot(b) # same
	np.dot([1,2,3],[4,5,6]) # inner product of two 1-D sequences: 1*4 + 2*5 + 3*6 = 32

	a = np.random.random((2,4))
	np.sum(a)
	np.max(a)
	np.min(a)
	np.sum(a, axis=0) # sum by column
	np.sum(a, axis=1) # sum by row


	A = np.arange(2,14).reshape((3,4))
	# array([[ 2, 3, 4, 5]
	#        [ 6, 7, 8, 9]
	#        [10,11,12,13]])
	np.argmax(A) # index (in the flattened array) of the maximum, 13: 11
	np.argmin(A) # index (in the flattened array) of the minimum, 2: 0
	np.mean(A) # A.mean() # 7.5
	np.median(A) # 7.5 (function only: ndarray has no .median() method)
	np.median(A, axis=0) # median of each column: array([6., 7., 8., 9.])
	np.average(A) # 7.5
	np.cumsum(A) # [2, 2+3, 2+3+4, ...] => array([ 2, 5, 9, 14, 20, 27, 35, 44, 54, 65, 77, 90])
	np.diff(A) # differences within each row: [[3-2, 4-3, 5-4], ...] => array([[1, 1, 1], [1, 1, 1], [1, 1, 1]])
	np.nonzero(A) # position of nonzero: (0, 0), (0, 1) ... returned as (row indices, column indices)
	#(array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]),
	# array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]))

	# Wrap the chained call in parentheses so that .reshape may sit on its own
	# line; without them the second line is a syntax error.
	A = (np.arange(14, 2, -1)
	     .reshape((3,4)))
	# array([[14, 13, 12, 11],
	#        [10,  9,  8,  7],
	#        [ 6,  5,  4,  3]])
	np.sort(A) # sorts each row in ascending order and returns a new array
	# array([[11,12,13,14]
	#        [ 7, 8, 9,10]
	#        [ 3, 4, 5, 6]])
	np.transpose(A)
	A.T # same
	# array([[14,10, 6]
	#        [13, 9, 5]
	#        [12, 8, 4]
	#        [11, 7, 3]])
	(A.T).dot(A) # (4*3) dot (3*4) => 4*4
	# array([[332, 302, 272, 242],
	#        [302, 275, 248, 221],
	#        [272, 248, 224, 200],
	#        [242, 221, 200, 179]])
	np.clip(A, 5, 9) # >9 => 9; <5 => 5
	# array([[ 9, 9, 9, 9]
	#        [ 9, 9, 8, 7]
	#        [ 6, 5, 5, 5]])

	a = np.arange(4).reshape((2, 2))
	a[0, 0] # 0
	# Loop bodies must be indented deeper than their `for` line.
	for row in a: # iterate rows
	    print(row)

	for column in a.T: # iterate columns via the transpose
	    print(column)

	a.flatten() # array([0, 1, 2, 3])
	for item in a.flat: # iterate items
	    print(item)


	a = np.array([1, 1, 1])
	b = np.array([2, 2, 2])
	np.vstack((a, b)) # array([[1, 1, 1], [2, 2, 2]])
	np.hstack((a, b)) # array([1, 1, 1, 2, 2, 2])
	a[:, np.newaxis] # array([[1], [1], [1]])
	# using concatenate
	np.concatenate((a, b, b), axis=0) # 0:vertical 1:horizontal


	a = np.arange(12).reshape((3, 4))
	np.hsplit(a, 2)
	np.split(a, 2, axis=1) # horizontal
	# [ array([[0, 1], [4, 5], [8, 9]]),
	#   array([[2, 3], [6, 7], [10, 11]])]
	np.vsplit(a, 3)
	np.split(a, 3, axis=0) # vertically
	# [array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[8, 9, 10, 11]])]
	# split the same lower-case `a` as above (the output below belongs to it);
	# array_split allows parts of unequal size, np.split does not
	np.array_split(a, 3, axis=1) # uneven split
	# [ array([[0, 1],[4, 5],[8, 9]]),
	#   array([[2], [6], [10]]),
	#   array([[3], [7], [11]])]

	a = np.arange(4)
	b = a
	a is b # True
	c = a.copy()
	c is a # False



	#pandas
	import pandas as pd
	s = pd.Series([1, 3, np.nan, 5])
	# 0 1.0
	# 1 3.0
	# 2 NaN
	# 3 5.0
	# dtype: float64

	dates = pd.date_range('20170101', periods=6)
	# DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04', '2017-01-05', '2017-01-06'],
	# dtype='datetime64[ns]', freq='D')

	df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
	# a b c d
	# 2017-01-01 -0.669733 0.091818 0.581845 -0.290370
	# 2017-01-02 0.203958 -0.840011 -1.234419 1.567374
	# 2017-01-03 0.761231 -0.712473 0.954426 2.002349
	# 2017-01-04 0.477278 0.860596 0.867349 0.438903
	# 2017-01-05 -1.431947 0.684325 -0.762821 0.815071
	# 2017-01-06 -0.095380 -0.515609 0.184032 -0.482174

	pd.DataFrame(np.arange(12).reshape((3, 4)))
	# 0 1 2 3
	# 0 0 1 2 3
	# 1 4 5 6 7
	# 2 8 9 10 11

	# Build a frame from a dict: scalars are repeated for every row, sequences
	# supply one value per row.
	df2 = pd.DataFrame({'A': 1.,
	                    'B': pd.Timestamp('20130102'),
	                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
	                    'D': np.arange(4),
	                    'E': pd.Categorical(['test', 'train', 'test', 'train']),
	                    'F': 'foo'})
	#      A          B    C  D      E    F
	# 0  1.0 2013-01-02  1.0  0   test  foo
	# 1  1.0 2013-01-02  1.0  1  train  foo
	# 2  1.0 2013-01-02  1.0  2   test  foo
	# 3  1.0 2013-01-02  1.0  3  train  foo
	df2.dtypes # dtype of each column (a DataFrame has no `.type` attribute)
	df2.index # row labels
	df2.columns # column labels
	df2.shape # (4, 6): rows, columns (a DataFrame has no `.rows` attribute)
	df2.values # the data as a numpy array
	df2.describe() # summary statistics
	df2.T # transpose
	df2.sort_index(axis=1, ascending=False) # sort by column name
	df2.sort_values(by='B') # sort by B column value

	# data selection
	dates = pd.date_range('20130101', periods=6)
	df = pd.DataFrame(np.arange(24).reshape((6,4)),
	index=dates,
	columns=['A','B','C','D'])

	'''
	A B C D
	2013-01-01 0 1 2 3
	2013-01-02 4 5 6 7
	2013-01-03 8 9 10 11
	2013-01-04 12 13 14 15
	2013-01-05 16 17 18 19
	2013-01-06 20 21 22 23
	'''

	df['A']
	df.A
	'''
	2013-01-01 0
	2013-01-02 4
	2013-01-03 8
	2013-01-04 12
	2013-01-05 16
	2013-01-06 20
	Freq: D, Name: A, dtype: int64
	'''

	df[0:3]
	'''
	A B C D
	2013-01-01 0 1 2 3
	2013-01-02 4 5 6 7
	2013-01-03 8 9 10 11
	'''

	df['20130102':'20130104']
	'''
	A B C D
	2013-01-02 4 5 6 7
	2013-01-03 8 9 10 11
	2013-01-04 12 13 14 15
	'''

	df.loc['20130102']
	'''
	A 4
	B 5
	C 6
	D 7
	Name: 2013-01-02 00:00:00, dtype: int64
	'''

	df.loc[:,['A','B']]
	'''
	A B
	2013-01-01 0 1
	2013-01-02 4 5
	2013-01-03 8 9
	2013-01-04 12 13
	2013-01-05 16 17
	2013-01-06 20 21
	'''

	df.loc['20130102',['A','B']]
	'''
	A 4
	B 5
	Name: 2013-01-02 00:00:00, dtype: int64
	'''

	df.iloc[3,1] # 13
	df.iloc[3:5, 1:3]
	'''
	B C
	2013-01-04 13 14
	2013-01-05 17 18
	'''
	df.iloc[[1,3,5],1:3]
	'''
	B C
	2013-01-02 5 6
	2013-01-04 13 14
	2013-01-06 21 22
	'''

	# The mixed position/label indexer .ix was removed from pandas. Pick the
	# first three rows by position (df.index[:3]) and the columns by label
	# through a single .loc call instead.
	df.loc[df.index[:3], ['A','C']]
	'''
	            A   C
	2013-01-01  0   2
	2013-01-02  4   6
	2013-01-03  8  10
	'''

	# Boolean indexing:
	df[df.A>8]
	'''
	A B C D
	2013-01-04 12 13 14 15
	2013-01-05 16 17 18 19
	2013-01-06 20 21 22 23
	'''


	dates = pd.date_range('20130101', periods=6)
	df = pd.DataFrame(np.arange(24).reshape((6, 4)),
	                  index=dates,
	                  columns=['A', 'B', 'C', 'D'])

	df.iloc[2, 2] = 111 # set by position: third row, third column (C)
	df.loc['20130101', 'B'] = 222 # set by label
	'''
	             A    B    C   D
	2013-01-01   0  222    2   3
	2013-01-02   4    5    6   7
	2013-01-03   8    9  111  11
	2013-01-04  12   13   14  15
	2013-01-05  16   17   18  19
	2013-01-06  20   21   22  23
	'''
	# Assign through one .loc call. The chained form `df.B[df.A>4] = 0` writes
	# into an intermediate object and is not guaranteed to change df.
	df.loc[df.A > 4, 'B'] = 0
	'''
	             A    B    C   D
	2013-01-01   0  222    2   3
	2013-01-02   4    5    6   7
	2013-01-03   8    0  111  11
	2013-01-04  12    0   14  15
	2013-01-05  16    0   18  19
	2013-01-06  20    0   22  23
	'''
	df['F'] = np.nan # new column, same value in every row
	'''
	             A    B    C   D   F
	2013-01-01   0  222    2   3 NaN
	2013-01-02   4    5    6   7 NaN
	2013-01-03   8    0  111  11 NaN
	2013-01-04  12    0   14  15 NaN
	2013-01-05  16    0   18  19 NaN
	2013-01-06  20    0   22  23 NaN
	'''
	# new column from a Series: values are matched to the rows by index label
	df['E'] = pd.Series([1,2,3,4,5,6],
	                    index=pd.date_range('20130101',
	                                        periods=6))
	'''
	             A    B    C   D   F  E
	2013-01-01   0  222    2   3 NaN  1
	2013-01-02   4    5    6   7 NaN  2
	2013-01-03   8    0  111  11 NaN  3
	2013-01-04  12    0   14  15 NaN  4
	2013-01-05  16    0   18  19 NaN  5
	2013-01-06  20    0   22  23 NaN  6
	'''

	dates = pd.date_range('20130101', periods=6)
	df = pd.DataFrame(np.arange(24).reshape((6, 4)),
	                  index=dates,
	                  columns=['A', 'B', 'C', 'D'])
	# NaN is a float, so make the two target columns float before assigning:
	# recent pandas versions no longer upcast an integer column silently.
	df = df.astype({'B': float, 'C': float})
	df.iloc[0, 1] = np.nan
	df.iloc[1, 2] = np.nan
	'''
	             A     B     C   D
	2013-01-01   0   NaN   2.0   3
	2013-01-02   4   5.0   NaN   7
	2013-01-03   8   9.0  10.0  11
	2013-01-04  12  13.0  14.0  15
	2013-01-05  16  17.0  18.0  19
	2013-01-06  20  21.0  22.0  23
	'''
	# returns a new frame; df itself is left unchanged
	df.dropna(
	    axis=0,     # 0: operate on rows; 1: operate on columns
	    how='any'   # 'any': drop if any NaN is present; 'all': drop only if every value is NaN
	)
	'''
	             A     B     C   D
	2013-01-03   8   9.0  10.0  11
	2013-01-04  12  13.0  14.0  15
	2013-01-05  16  17.0  18.0  19
	2013-01-06  20  21.0  22.0  23
	'''
	df.fillna(value=0) # new frame with every NaN replaced by 0
	'''
	             A     B     C   D
	2013-01-01   0   0.0   2.0   3
	2013-01-02   4   5.0   0.0   7
	2013-01-03   8   9.0  10.0  11
	2013-01-04  12  13.0  14.0  15
	2013-01-05  16  17.0  18.0  19
	2013-01-06  20  21.0  22.0  23
	'''
	df.isnull()
	'''
	                A      B      C      D
	2013-01-01  False   True  False  False
	2013-01-02  False  False   True  False
	2013-01-03  False  False  False  False
	2013-01-04  False  False  False  False
	2013-01-05  False  False  False  False
	2013-01-06  False  False  False  False
	'''
	# reduce over the underlying array so the answer is one bool for the whole frame
	df.isnull().values.any() # if exists nan: True


	# read
	data = pd.read_csv('students.csv')
	# to pickle
	data.to_pickle('student.pickle')

	# concat
	# axis, default 0
	df1 = pd.DataFrame(np.ones((3, 4))*0, columns=['a', 'b', 'c', 'd'])
	df2 = pd.DataFrame(np.ones((3, 4))*1, columns=['a', 'b', 'c', 'd'])
	df3 = pd.DataFrame(np.ones((3, 4))*2, columns=['a', 'b', 'c', 'd'])
	pd.concat([df1, df2, df3], axis=0)
	'''
	a b c d
	0 0.0 0.0 0.0 0.0
	1 0.0 0.0 0.0 0.0
	2 0.0 0.0 0.0 0.0
	0 1.0 1.0 1.0 1.0
	1 1.0 1.0 1.0 1.0
	2 1.0 1.0 1.0 1.0
	0 2.0 2.0 2.0 2.0
	1 2.0 2.0 2.0 2.0
	2 2.0 2.0 2.0 2.0
	'''
	pd.concat([df1, df2, df3], axis=0, ignore_index=True)
	'''
	a b c d
	0 0.0 0.0 0.0 0.0
	1 0.0 0.0 0.0 0.0
	2 0.0 0.0 0.0 0.0
	3 1.0 1.0 1.0 1.0
	4 1.0 1.0 1.0 1.0
	5 1.0 1.0 1.0 1.0
	6 2.0 2.0 2.0 2.0
	7 2.0 2.0 2.0 2.0
	8 2.0 2.0 2.0 2.0
	'''

	df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'], index=[1,2,3])
	df2 = pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d','e'], index=[2,3,4])
	pd.concat([df1, df2], axis=0, join='outer')
	'''
	a b c d e
	1 0.0 0.0 0.0 0.0 NaN
	2 0.0 0.0 0.0 0.0 NaN
	3 0.0 0.0 0.0 0.0 NaN
	2 NaN 1.0 1.0 1.0 1.0
	3 NaN 1.0 1.0 1.0 1.0
	4 NaN 1.0 1.0 1.0 1.0
	'''

	pd.concat([df1, df2], axis=0, join='inner')
	'''
	b c d
	1 0.0 0.0 0.0
	2 0.0 0.0 0.0
	3 0.0 0.0 0.0
	2 1.0 1.0 1.0
	3 1.0 1.0 1.0
	4 1.0 1.0 1.0
	'''

	pd.concat([df1, df2], axis=0, join='inner', ignore_index=True)
	'''
	b c d
	0 0.0 0.0 0.0
	1 0.0 0.0 0.0
	2 0.0 0.0 0.0
	3 1.0 1.0 1.0
	4 1.0 1.0 1.0
	5 1.0 1.0 1.0
	'''

	# horizontal by index
	df1 = pd.DataFrame(np.ones((3, 4)),
	                   columns=['A', 'B', 'C', 'D'],
	                   index=[1, 2, 3])
	df2 = pd.DataFrame(np.ones((3, 4)),
	                   columns=['A', 'B', 'C', 'D'],
	                   index=[2, 3, 4])
	# The join_axes argument was removed from pd.concat. Reindexing the result
	# with df1's index keeps only the rows labelled 1, 2, 3.
	pd.concat([df1, df2], axis=1).reindex(df1.index)

	#      A    B    C    D    A    B    C    D
	# 1  1.0  1.0  1.0  1.0  NaN  NaN  NaN  NaN
	# 2  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0
	# 3  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0


	df1 = pd.DataFrame(np.ones((3, 4)) * 0,
	                   columns=['A', 'B', 'C', 'D'])
	df2 = pd.DataFrame(np.ones((3, 4)) * 1,
	                   columns=['A', 'B', 'C', 'D'])
	# NOTE(review): df3 is not used in this example
	df3 = pd.DataFrame(np.ones((3, 4)) * 1,
	                   columns=['A', 'B', 'C', 'D'])

	# DataFrame.append was removed in pandas 2.0; pd.concat does the same job.
	pd.concat([df1, df2], ignore_index=True)
	'''
	     A    B    C    D
	0  0.0  0.0  0.0  0.0
	1  0.0  0.0  0.0  0.0
	2  0.0  0.0  0.0  0.0
	3  1.0  1.0  1.0  1.0
	4  1.0  1.0  1.0  1.0
	5  1.0  1.0  1.0  1.0
	'''

	# The Series index must match the frame's column labels (upper case here),
	# otherwise the values land in new columns.
	s1 = pd.Series([1,2,3,4],
	               index=['A','B','C','D'])

	# A Series is added as one row: turn it into a 1-row frame first.
	pd.concat([df1, s1.to_frame().T], ignore_index=True)
	'''
	#      A    B    C    D
	# 0  0.0  0.0  0.0  0.0
	# 1  0.0  0.0  0.0  0.0
	# 2  0.0  0.0  0.0  0.0
	# 3  1.0  2.0  3.0  4.0
	'''


	left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
	'A': ['A0', 'A1', 'A2', 'A3'],
	'B': ['B0', 'B1', 'B2', 'B3']})
	right = pd.DataFrame({'key': ['K1', 'K2', 'K3', 'K4'],
	'C': ['C0', 'C1', 'C2', 'C3'],
	'D': ['D0', 'D1', 'D2', 'D3']})
	'''
	A B key
	0 A0 B0 K0
	1 A1 B1 K1
	2 A2 B2 K2
	3 A3 B3 K3
	'''
	'''
	C D key
	0 C0 D0 K1
	1 C1 D1 K2
	2 C2 D2 K3
	3 C3 D3 K4
	'''
	pd.merge(left, right, on='key')
	'''
	A B key C D
	0 A1 B1 K1 C0 D0
	1 A2 B2 K2 C1 D1
	2 A3 B3 K3 C2 D2
	'''

	left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
	'key2': ['K0', 'K1', 'K0', 'K1'],
	'A': ['A0', 'A1', 'A2', 'A3'],
	'B': ['B0', 'B1', 'B2', 'B3']})
	right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
	'key2': ['K0', 'K0', 'K0', 'K0'],
	'C': ['C0', 'C1', 'C2', 'C3'],
	'D': ['D0', 'D1', 'D2', 'D3']})

	pd.merge(left, right, on=['key1', 'key2'], how='inner')
	pd.merge(left, right, on=['key1', 'key2'], how='outer')
	pd.merge(left, right, on=['key1', 'key2'], how='left')
	pd.merge(left, right, on=['key1', 'key2'], how='right')


	df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b']})
	df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]})
	pd.merge(df1, df2, on='col1', how='outer', indicator=True)
	'''
	# col1 col_left col_right _merge
	# 0 0.0 a NaN left_only
	# 1 1.0 b 2.0 both
	# 2 2.0 NaN 2.0 right_only
	# 3 2.0 NaN 2.0 right_only
	'''
	pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
	'''
	col1 col_left col_right indicator_column
	0 0.0 a NaN left_only
	1 1.0 b 2.0 both
	2 2.0 NaN 2.0 right_only
	3 2.0 NaN 2.0 right_only
	'''

	# merge by index
	left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
	'B': ['B0', 'B1', 'B2']},
	index=['K0', 'K1', 'K2'])
	right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
	'D': ['D0', 'D2', 'D3']},
	index=['K0', 'K2', 'K3'])
	pd.merge(left, right, left_index=True, right_index=True, how='outer')
	# A B C D
	# K0 A0 B0 C0 D0
	# K1 A1 B1 NaN NaN
	# K2 A2 B2 C2 D2
	# K3 NaN NaN C3 D3

	pd.merge(left, right, left_index=True, right_index=True, how='inner')
	# A B C D
	# K0 A0 B0 C0 D0
	# K2 A2 B2 C2 D2

	boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
	girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
	# use suffixes to tell apart the overlapping (same-named) columns
	pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')
	'''
	age_boy k age_girl
	0 1 K0 4
	1 1 K0 5
	'''

	# draw
	import matplotlib.pyplot as plt
	data = pd.Series(np.random.randn(1000),index=np.arange(1000))
	# cumsum() returns a new Series; keep it, otherwise the raw noise is plotted
	data = data.cumsum()
	data.plot()
	plt.show()
No results found