Data Ingestion with pandas

# 1.  Importing Data from Flat Files and Spreadsheets

# read_csv for all flat files
import pandas as pd

data = pd.read_csv('file.csv')
data = pd.read_csv('file.tsv', sep='\t')   # tab-separated file

# choose columns to load by name
col_names = ['STATEFIPS', 'STATE', 'zipcode', 'agi_stub', 'N1']
data = pd.read_csv('file.csv', usecols=col_names)

# choose columns to load by number (zero-based positions)
col_nums = [0, 1, 2, 3, 4]
data = pd.read_csv('file.csv', usecols=col_nums)

# limiting rows, skipping rows, no header row in the file
data = pd.read_csv('file.csv', nrows=1000, skiprows=1000, header=None)

# assigning column names (reuse the columns of another dataframe)
colnames = list(another_dataframe)
data = pd.read_csv('file.csv', header=None, names=colnames)

# specifying data types
data = pd.read_csv('file.csv', dtype={'zipcode': str})

# customizing missing data values (0 is treated as NaN in the example)
data = pd.read_csv('file.csv', na_values={'zipcode': 0})

# lines with errors
# on_bad_lines='warn' skips malformed lines and emits a warning for each
# (replaces error_bad_lines=False, warn_bad_lines=True, which were
# deprecated in pandas 1.3 and removed in pandas 2.0)
data = pd.read_csv('file.csv', on_bad_lines='warn')

# read_excel for excel files
data = pd.read_excel('file.xlsx')   # reads only the first sheet

# many parameters in common with read_csv: nrows, skiprows, ...
# usecols also accepts a range of spreadsheet column letters;
# here only columns W to AB plus AR are imported
data = pd.read_excel('file.xlsx', usecols="W:AB, AR")

# two possibilities for selecting the second sheet, named 'sheet2'
data = pd.read_excel('file.xlsx', sheet_name=1)   # zero-indexed position
data = pd.read_excel('file.xlsx', sheet_name='sheet2')

# import the first two sheets (returns a dict of dataframes)
data = pd.read_excel('file.xlsx', sheet_name=[0, 1])
data = pd.read_excel('file.xlsx', sheet_name=['sheet1', 'sheet2'])

# take all the sheets of a file (one dataframe per sheet, keyed by name)
data = pd.read_excel('file.xlsx', sheet_name=None)

# dtype specifies column types;
# true_values and false_values map custom strings to booleans
data = pd.read_excel('file.xlsx',
                     dtype={'col5': bool, 'col6': bool},
                     true_values=['Yes'],
                     false_values=['No'])
# NB: blank cells in a bool column are read as True!

# datetime columns use parse_dates, not dtype
# if the column is in a standard datetime format this is enough:
data = pd.read_excel('file.xlsx',
                     parse_dates=['col1', 'col2'])

# for a non-standard date column, e.g. 311299 for 31/12/1999,
# load it as-is and parse afterwards with an explicit format string
format_string = '%d%m%y'
data['col3'] = pd.to_datetime(data['col3'], format=format_string)

# 2.  Importing Data from Databases

import pandas as pd

# library to connect to databases
from sqlalchemy import create_engine, inspect

# create the database engine for the db data.db on sqlite
engine = create_engine('sqlite:///data.db')

# view the tables in the database
# (inspect() replaces engine.table_names(), which was deprecated in
# SQLAlchemy 1.4 and removed in 2.0)
print(inspect(engine).get_table_names())

# load the whole tab table without writing any SQL
tabloaded = pd.read_sql('tab', engine)

# create a SQL query to load data selectively
query = 'SELECT col1, col2 FROM tab WHERE col1 > 0'

# load the query result into a dataframe
tabloaded = pd.read_sql(query, engine)


# 3.  Importing JSON Data and Working with APIs

import pandas as pd

# load a JSON file into a dataframe
jsondata = pd.read_json('file.json')

# orient specifies the layout of the JSON:
# 'split'   : dict like {index -> [index], columns -> [columns], data -> [values]}
# 'records' : list like [{column -> value}, ... , {column -> value}]
# 'index'   : dict like {index -> {column -> value}}
# 'columns' : dict like {column -> {index -> value}}
# 'values'  : just the values array
jsondata = pd.read_json('file.json', orient='split')

# import data from an API
# requests library used for any url
import requests

# example: using the Yelp API
api_url = "https://api.yelp.com/v3/businesses/search"

# set up parameter dictionary according to documentation
params = {"term": "bookstore", "location": "San Francisco"}

# set up header dictionary w/ API key according to documentation
# (api_key must be defined beforehand)
headers = {"Authorization": "Bearer {}".format(api_key)}

# call the API
response = requests.get(api_url, params=params, headers=headers)

# isolate the JSON data from the response object
data = response.json()

# load businesses data into a dataframe
df = pd.DataFrame(data["businesses"])

# flatten nested JSON with pd.json_normalize, using _ as separator
# (importing json_normalize from pandas.io.json is deprecated
# since pandas 1.0 — use the top-level pd.json_normalize instead)
df = pd.json_normalize(data["businesses"], sep="_")

# flatten the categories records, bringing in business details as metadata
# (nested meta fields are given as [outer, inner] paths)
df = pd.json_normalize(data["businesses"], sep="_", record_path="categories",
                       meta=["name", "alias", "rating",
                             ["coordinates", "latitude"],
                             ["coordinates", "longitude"]],
                       meta_prefix="biz_")


# 4.  Combining multiple datasets

import pandas as pd

# stack datasets vertically with pd.concat
# (DataFrame.append was deprecated in pandas 1.4 and removed in 2.0)
appended = pd.concat([df1, df2])

# set ignore_index to True to renumber the rows
appended = pd.concat([df1, df2], ignore_index=True)

# merge(): pandas function and dataframe method for joining datasets
# default is an inner join (only keys present in both dataframes)
merged = df1.merge(df2, on='col1')
merged = df1.merge(df2, left_on='col1', right_on='col2')



No comments:

Post a Comment