Date Format | Read | Save |
---|---|---|
csv | pd.read_csv() | df.to_csv() |
json | pd.read_json() | df.to_json() |
excel | pd.read_excel() | df.to_excel() |
hdf | pd.read_hdf() | df.to_hdf() |
sql | pd.read_sql() | df.to_sql() |
df = pd.read_csv('folder/file.csv') # import csv
df = pd.read_excel(xlsx) # import excel file
df.head() # view first 5 lines
df.columns # columns names
df.describe(include = "all") # summary of each column (without
# include, will exclude NaN)
df.info
df['Col1'] # view column Col1 as a series
df[['Col1']] # view column Col1 as a data frame (with header and row
df.info
df['Col1'] # view column Col1 as a series
df[['Col1']] # view column Col1 as a data frame (with header and row
# numbers)
y = df[['Col2','Col3','Col4']] # create a new data frame with 3
y = df[['Col2','Col3','Col4']] # create a new data frame with 3
# columns of the original one
df['NewColumn'] = [1,0,...] # add column "NewColumn"
df.loc['ID'] # select only the column with the label "ID" (loc function
# is "label-based")
df.loc[['ID']] # same result, but the single bracket version gives a
# Pandas Series, the double bracket version gives a Pandas DataFrame.
df.loc[["IDrow","IDrow2"]] # Pandas DataFrame with 2 rows. Ok
# only with the double brackets.
df.loc["IDrow", "ColumnName"] # select only the element of the row
df.loc["IDrow", "ColumnName"] # select only the element of the row
# "IDrow" in the column "ColumnName"
df.loc["IDrow"]["ColumnName"] # same result
df.loc["ColumnName"].loc["IDrow"] # same result
df.loc[["IDrow","IDrow2"],["ColumnName", "ColumnName2"]]
df.loc["IDrow"]["ColumnName"] # same result
df.loc["ColumnName"].loc["IDrow"] # same result
df.loc[["IDrow","IDrow2"],["ColumnName", "ColumnName2"]]
# Pandas DataFrame with 2 rows and 2 columns.
df.iloc[0, 2] # access value in the first raw and third column
df.iloc[0:2, 0:3] # first two raws and first three columns
avg_col1 = df['Col1'].astype('float').mean(axis=0)
df.iloc[0:2, 0:3] # first two raws and first three columns
avg_col1 = df['Col1'].astype('float').mean(axis=0)
# average for Col1
df.dtypes # data types for each column
df[['Col1']] = df[['Col1']].astype('float') # convert Col1 to
df.dtypes # data types for each column
df[['Col1']] = df[['Col1']].astype('float') # convert Col1 to
# float
df.rename(columns={'Col1':'Col2'}, inplace=True) # rename
df.rename(columns={'Col1':'Col2'}, inplace=True) # rename
# Col1 in Col2
df.replace("?", np.nan, inplace = True) # replace "?" to NaN
df.dropna() # delete record with one or more Nan
df.isnull() # true if the cell is null
df.isnull() # true if the cell is null
# create bins
bins = np.linspace(min(df['Col1']), max(df['Col1']), 4)# create three equal bins
group_names = ['Low', 'Medium', 'High']
df['Col1_bin'] = pd.cut(df['Col1'], bins, labels=group_names, include_lowest=True)
df['Col1_bin].value_counts() # values for each bin
group_names = ['Low', 'Medium', 'High']
df['Col1_bin'] = pd.cut(df['Col1'], bins, labels=group_names, include_lowest=True)
df['Col1_bin].value_counts() # values for each bin
# dummy variables
dummy_1 = pd.get_dummies(df['Col1']) # if Col1 is a dummy, split# to two columns (0,1)
No comments:
Post a Comment