Basic Python Pandas

import pandas as pd     # for dataframes

Date Format Read Save
csv pd.read_csv() df.to_csv()
json pd.read_json() df.to_json()
excel pd.read_excel() df.to_excel()
hdf pd.read_hdf() df.to_hdf()
sql pd.read_sql() df.to_sql()

df = pd.read_csv('folder/file.csv')
     # import csv
xlsx = ('folder/file.xlsx')
df = pd.read_excel(xlsx) 
   # import excel file
df.head()     # view first 5 lines
df.columns    # columns names
df.describe(include = "all")    # summary of each column (without  
# include, will exclude NaN)
df.info
df['Col1']     # view column Col1 as a series
df[['Col1']]     # view column Col1 as a data frame (with header and row 
# numbers)
y = df[['Col2','Col3','Col4']]     # create a new data frame with 3  
# columns of the original one
df['NewColumn'] = [1,0,...]       # add column "NewColumn"
df.loc['ID']      # select only the column with the label "ID" (loc function  
# is "label-based")
df.loc[['ID']]      # same result, but the single bracket version gives a 
# Pandas Series, the double bracket version gives a Pandas DataFrame.
df.loc[["IDrow","IDrow2"]]     # Pandas DataFrame with 2 rows. Ok 
# only with the double brackets.
df.loc["IDrow", "ColumnName"]     # select only the element of the row  
# "IDrow" in the column "ColumnName"
df.loc["IDrow"]["ColumnName"]       # same result
df.loc["ColumnName"].loc["IDrow"]      # same result
df.loc[["IDrow","IDrow2"],["ColumnName", "ColumnName2"]] 
#  Pandas DataFrame with 2 rows and 2 columns.
df.iloc[0, 2]     # access value in the first raw and third column
df.iloc[0:2, 0:3]     # first two raws and first three columns
avg_col1 = df['Col1'].astype('float').mean(axis=0)
# average for Col1
df.dtypes    # data types for each column
df[['Col1']] = df[['Col1']].astype('float')    # convert Col1 to 
# float
df.rename(columns={'Col1':'Col2'}, inplace=True)    # rename 
# Col1 in Col2
df.replace("?", np.nan, inplace = True)    # replace "?" to NaN
df.dropna()    # delete record with one or more Nan
df.isnull()    # true if the cell is null

# create bins

bins = np.linspace(min(df['Col1']), max(df['Col1']), 4)    
# create three equal bins
group_names = ['Low', 'Medium', 'High']
df['Col1_bin'] = pd.cut(df['Col1'], bins, labels=group_names, include_lowest=True)
df['Col1_bin].value_counts()
    # values for each bin

# dummy variables

dummy_1 = pd.get_dummies(df['Col1'])    # if Col1 is a dummy, split 
# to two columns (0,1)


No comments:

Post a Comment