Importing Data
- Core packages
- pandas — data analysis and manipulation library
- numpy — fundamental package for scientific computing
- requests — HTTP (protocol) library
- Beautiful Soup — HTML and XML document processing library
- sqlalchemy — SQL toolkit and Object Relational Mapper (ORM)
- pickle — implements binary protocols for serializing and de-serializing Python objects
Flat Files – Python I/O
def import_text(file_path):
    """ read an entire text file into a list of lines """
    with open(file_path) as file:
        data = file.readlines()
    return data
def import_text_chunk(file_object, chunk_size=1000):
    """ lazily load a file piece by piece using a generator function
        usage: pass an open() file object; see the usage sketch below """
    while True:
        data = file_object.read(chunk_size)
        if not data:
            break
        yield data
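A minimal usage sketch for the generator above; 'data.txt' is only a placeholder file name.
# consume the file chunk by chunk without loading it all into memory
with open('data.txt') as file:
    for chunk in import_text_chunk(file, chunk_size=1000):
        print(len(chunk))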
Flat Files – NumPy
import numpy as np
def import_numeric_text(file_path, delimiter, skiprows=0):
    """ load numeric data into a numpy array """
    return np.loadtxt(file_path, delimiter=delimiter, skiprows=skiprows)
def import_mixed_type_text(file_path, delimiter, has_header=True, dtype=None):
    """ load a mixed data type file into a numpy structured array """
    return np.genfromtxt(file_path, delimiter=delimiter, names=has_header, dtype=dtype)
def import_mixed_type_text_csv(file_path):
    """ load a mixed data type csv file into a numpy structured array """
    return np.recfromcsv(file_path)
Flat Files – Pandas
import pandas as pd
def import_csv_df(file_path, delimiter=','):
    """ import a csv file into a pandas DataFrame """
    return pd.read_csv(file_path, sep=delimiter)
def import_corrupted_csv_df(file_path, delimiter=',', comment='#', na_values='Nothing'):
    """ import corrupted data; comment = character after which comments occur,
        na_values = string or list of strings to recognize as NA/NaN """
    return pd.read_csv(file_path, sep=delimiter, comment=comment, na_values=na_values)
Binary Files – Pickle
import pickle
def import_bin_file(file_path):
    """ import the content of a pickled file """
    with open(file_path, 'rb') as file:
        data = pickle.load(file)
    return data
Excel Files – Pandas
import pandas as pd
def import_excel_file_df(file_path):
    """ import an excel file as a pandas ExcelFile object (worksheets listed via .sheet_names) """
    return pd.ExcelFile(file_path)
def import_excel_worksheet_df(file_path, worksheet, skiprows=None):
    """ import the content of an excel worksheet as a pandas DataFrame
        worksheet can be an index or a name """
    return pd.ExcelFile(file_path).parse(worksheet, skiprows=skiprows)
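A short usage sketch of the two helpers above; 'report.xlsx' and 'Sheet1' are placeholder names.
# list the worksheets, then load one of them, skipping its first row
xls = import_excel_file_df('report.xlsx')
print(xls.sheet_names)
df = import_excel_worksheet_df('report.xlsx', 'Sheet1', skiprows=1)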
SAS/ Stata Files (sas7bdat, dta)
from sas7bdat import SAS7BDAT
import pandas as pd
def import_sas_file_df(file_path):
    """ import the content of a sas file as a pandas DataFrame """
    with SAS7BDAT(file_path) as file:
        df = file.to_data_frame()
    return df
def import_dta_file_df(file_path):
    """ import the content of a stata file as a pandas DataFrame """
    return pd.read_stata(file_path)
HDF5 Files – [meta, quality, strain]
import h5py
def import_hdf5_file(file_path):
    """ import the content of a HDF5 file """
    return h5py.File(file_path, 'r')
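A quick sketch of exploring an HDF5 file's groups; the file name is a placeholder and the group names assume the [meta, quality, strain] layout from the heading above.
data = import_hdf5_file('LIGO_data.hdf5')   # placeholder file name
for key in data.keys():                     # expected groups: meta, quality, strain
    print(key)
print(list(data['meta'].keys()))            # drill into a group to list its datasets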
MATLAB Files – [matrix laboratory; .mat]
import scipy.io
def import_matlab_file(file_path):
    """ import the content of a MATLAB file (returns a dict of variables) """
    return scipy.io.loadmat(file_path)
Relational Databases – SQLite
from sqlalchemy import create_engine
import pandas as pd
def query_sqlite_df1(con_str, sql_query):
    """ import the result of a sql query as a pandas DataFrame, long form """
    engine = create_engine('sqlite:///' + con_str)
    con = engine.connect()
    rs = con.execute(sql_query)
    df = pd.DataFrame(rs.fetchall())
    df.columns = rs.keys()
    con.close()
    return df
def query_sqlite_df2(con_str, sql_query):
    """ import the result of a sql query as a pandas DataFrame, short form """
    engine = create_engine('sqlite:///' + con_str)
    return pd.read_sql_query(sql_query, engine)
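A hypothetical call to the short-form helper; the database file and the query are placeholders for illustration.
# 'Chinook.sqlite' and the Album table are illustrative names only
df = query_sqlite_df2('Chinook.sqlite', 'SELECT * FROM Album LIMIT 5')
print(df.head())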
Web files, html, parsed html text – urlretrieve
from urllib.request import urlretrieve
from bs4 import BeautifulSoup
import requests
import pandas as pd
def import_and_save_web_file_df(url, local_file_path):
    """ save a web (csv) file locally and load it as a pandas DataFrame """
    urlretrieve(url, local_file_path)
    return pd.read_csv(local_file_path)
def import_web_file_df(url, delimiter):
    """ import a web (delimited) file as a pandas DataFrame """
    return pd.read_csv(url, sep=delimiter)
def import_web_excel_file_df(url):
    """ import a web excel file as a dict of pandas DataFrames (one per worksheet) """
    return pd.read_excel(url, sheet_name=None)
def import_web_excel_worksheet_df(url, sheet_name):
    """ import a single worksheet of a web excel file as a pandas DataFrame """
    return pd.read_excel(url, sheet_name=None)[sheet_name]
def import_from_url_html(url):
    """ import raw html from a web request """
    return requests.get(url)
def import_from_url_html_formatted(url):
    """ import pretty-printed html from a web request """
    return BeautifulSoup(requests.get(url).text, 'html.parser').prettify()
def import_from_url_clean_text(url):
    """ import clean text from a web request """
    return BeautifulSoup(requests.get(url).text, 'html.parser').get_text()
def import_from_url_text_links(url):
    """ import links (href attributes) from a web request """
    return [a.get('href') for a in BeautifulSoup(requests.get(url).text, 'html.parser').find_all('a')]
WebAPI (json)
import requests
def import_from_url_json(url):
    """ import a json response from a web api request as a dict """
    return requests.get(url).json()
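A small usage sketch; the endpoint is a placeholder and the response is assumed to decode to a dict.
# 'https://api.example.com/items' is an illustrative endpoint only
json_data = import_from_url_json('https://api.example.com/items')
for key, value in json_data.items():
    print(key + ':', value)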
Cleaning Data
- Core packages
- pandas — data analysis and manipulation library
- numpy — fundamental package for scientific computing
- matplotlib — comprehensive library for creating static, animated, and interactive visualizations
Basic Pattern Matching
import glob
import re
# Globbing - pattern matching for file names, wildcards allowed e.g. *.csv
csv_files = glob.glob('*.csv')
print(csv_files)
# RegEx
pattern1 = re.compile(r'\$\d*\.\d{2}')
result = pattern1.match('$17.89')
print(bool(result))
pattern2 = re.findall(r'\d+', 'the recipe calls for 10 strawberries and 1 banana')
print(pattern2)
# List comprehensions
num_list = [num ** 2 for num in range(1000) if num % 2 == 0]
print(num_list)
Inspecting pandas DataFrame
import pandas as pd
df = pd.read_csv('http://samplecsvs.s3.amazonaws.com/Sacramentorealestatetransactions.csv')
# top 5 and bottom 5 rows data sample
df.head()
df.tail()
# column names e.g. it clearly displays leading or trailing spaces
df.columns
# number of rows and columns in the dataset
df.shape
# index range, memory usage, column data types and number of missing values per column
df.info()
Exploratory Data Analysis
# frequency counts --- df.column.value_counts() or df[col_name].value_counts();
# pass dropna=False to also count missing values
print(df.city.value_counts(dropna=False))
# summary statistics
print(df.describe())
Visualizing Data – spotting outliers
# Visualization 101
# 1. Bar plots for discrete data counts
# 2. Histograms for continuous data counts
# 3. Box plots - visualize basic summary statistics
# 4. Scatter plots - find relationship between 2 numeric variables
import matplotlib.pyplot as plt
df.price.plot(kind='hist')
plt.show()
# filtering rows - using boolean indexing
df[df.price > 100000]
# box plot is for numeric value over some categories
# e.g. real estate price by city
df.boxplot(column="price", by="city")
plt.show()
# scatter plots are ideal for visualizing two numeric columns
df.plot(kind='scatter', x='initial_cost', y='total_est_fee', rot=70)
plt.show()
Tidy Data
# formalizes the way we describe the shape of data
# gives us a goal when formatting our data
# standard way to organize data values within a dataset
# 1. each column represents a separate variable
# 2. each row represents an individual observation
# 3. each type of observational unit forms a table
# tidy data makes it easier to fix common data problems
# pd.melt() - turns column headers into values of a single 'variable' column
# melting data is the process of turning columns into rows
# not always a good idea to do this...
pd.melt(df, id_vars=['Month', 'Day'],
        var_name='measurement', value_name='reading')
# pivoting - the opposite of melting i.e. turning rows into columns; dataset.pivot()
# turns unique values of a column into new columns
# pivot_table - can handle duplicate data points where a regular pivot would fail
# turn the 'measurement' and 'reading' columns into a pivot table [columns], multi-index
pivot_dataset = df.pivot_table(
    index=['Month', 'Day'], columns='measurement', values='reading')
# pivoting gives a hierarchical index aka multi-index data frame; to flatten it back use reset_index() (sketch below)
# pivot with an aggregate function - e.g. np.mean to collapse duplicates
import numpy as np
pivot_dataset2 = df.pivot_table(
    index=['Month', 'Day'], columns='measurement', values='reading', aggfunc=np.mean)
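A quick sketch of flattening the hierarchical index produced above back into ordinary columns.
# reset_index() turns the multi-index back into regular columns
flat_dataset = pivot_dataset2.reset_index()
print(flat_dataset.head())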
# splitting a column with string slicing, e.g. a 'variable' column holding values like 'm014'
df['gender'] = df.variable.str[0]
df['age_group'] = df.variable.str[1:]
# checking data types
# df.dtypes to inspect, then pd.to_numeric(df[col]) etc. to convert (sketch below)
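A minimal sketch of the dtype check and conversion just mentioned; the 'price' column is used purely for illustration.
# inspect column dtypes, then coerce a text column to numeric (bad values become NaN)
print(df.dtypes)
df['price'] = pd.to_numeric(df['price'], errors='coerce')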
# combining/ concatenating data sets with pd.concat, i.e. pd.concat([set1, set2]) -- keep in mind that the row index stays
# as per the originals !!! (multiple 0, 1, 2, 3 etc.); use ignore_index=True to reset the index of the new data frame;
# use axis=1 to concat columns of data, axis=0 for rows of data (sketch below)
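A short concat sketch for the notes above; df1 and df2 are illustrative throwaway frames.
df1 = pd.DataFrame({'a': [1, 2]})
df2 = pd.DataFrame({'a': [3, 4]})
rows = pd.concat([df1, df2], axis=0, ignore_index=True)   # stack rows, reset the index
cols = pd.concat([df1, df2], axis=1)                      # place frames side by side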
# merging data - similar to a SQL join
# pd.merge(left=, right=, on=, left_on=, right_on=) -- use on= or left_on= + right_on= to specify the key columns
# merge types: one-to-one, one-to-many, many-to-many (duplicate keys)
site = pd.DataFrame({'name': ['DR-1', 'DR-3']})               # placeholder example frames
visited = pd.DataFrame({'site': ['DR-1', 'DR-3', 'DR-3']})
o2o = pd.merge(left=site, right=visited, left_on='name', right_on='site')
# column type casting; e.g. categorical columns/ variables reduce the size of the data frame
tips = pd.DataFrame({'sex': ['Female', 'Male'], 'total_bill': ['16.99', 'missing'], 'tip': ['1.01', 'missing']})  # placeholder example frame
tips['sex'] = tips.sex.astype('category')
# convert a column to numeric: the 'total_bill' and 'tip' columns in this DataFrame are stored as object types because
# the string 'missing' is used in these columns to encode missing values; by coercing the values into a numeric type,
# they become proper NaN values.
tips['total_bill'] = pd.to_numeric(tips['total_bill'], errors='coerce')
# applying functions to columns - df.apply(function_name, axis=) or df[col].apply(function_name)
def decode_gender(gender):
    if gender == 'Female':
        return 0
    elif gender == 'Male':
        return 1
    else:
        return np.nan
tips['decode'] = tips.sex.apply(decode_gender)
# or using a lambda
tips['decode-female'] = tips.sex.apply(lambda x: x == 'Female')
Missing Data
# df.info() -- gives basic info including missing values, data types etc.
# dropping duplicates - df.drop_duplicates()
# NaN -- leave, drop or fill missing values
# drop missing values in the entire data frame with df.dropna() -- drops rows (be careful)
# fill missing values by column - df[col].fillna('missing') --- can be applied to multiple cols at once
# fill missing values with statistical values (e.g. the column mean) instead of 0s etc.
# testing with assert statements -- assert raises an AssertionError if its expression is not truthy
# e.g. assert pd.notnull(ebola).all().all() -- the first .all() returns a True or False for each column,
# while the second .all() reduces that to a single True or False; see the sketch below
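A minimal sketch of the drop / fill / assert pattern above, reusing the earlier df and its 'price' column purely for illustration.
clean = df.dropna()                                      # drop rows with any missing value
df['price'] = df['price'].fillna(df['price'].mean())    # fill a column with its mean
assert pd.notnull(df['price']).all()                     # raises if any NaN remains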