# Max Smith, March 20, 2016
# mcsmith12@earlham.edu
import numpy as np
import pandas as pd
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
%matplotlib inline
# Load the raw table, using the first CSV column as the row index
# (rows are looked up by label later, e.g. dta.loc['El Paso']).
# The `with` block closes the file as soon as parsing is finished.
# pd.read_csv replaces DataFrame.from_csv, which newer pandas no longer
# provides.  from_csv also attempted to parse the index as dates by
# default; the index here holds text labels, so that is not requested.
with open('elpaso.csv', 'r') as handle:
    dta = pd.read_csv(handle, index_col=0)
# Remove the unwanted rows, identified by their index labels.
unwanted_rows = ['TEXAS', 'Offenses Known to Law Enforcement', 'by City, 2013', 'City']
dta = dta.drop(unwanted_rows)
# Remove the two unwanted columns as well.
unwanted_columns = ['Unnamed: 4', 'Unnamed: 5']
dta = dta.drop(unwanted_columns, axis=1)
# Give the remaining placeholder columns meaningful names.
# These names are the identifiers used in the dmatrices
# formulas further down, so they must stay in sync with them.
column_names = {
    'Unnamed: 1': 'Population',
    'Unnamed: 2': 'Violent_Crime',
    'Unnamed: 3': 'Murder',
    'Unnamed: 6': 'Robbery',
    'Unnamed: 7': 'Agg_Assault',
    'Unnamed: 8': 'Property_Crime',
    'Unnamed: 9': 'Burglary',
    'Unnamed: 10': 'Larceny_Theft',
    'Unnamed: 11': 'Motor_Veh_Theft',
    'Unnamed: 12': 'Arson',
}
dta = dta.rename(columns=column_names)
# Coerce every column to a numeric dtype; entries that cannot be parsed
# as numbers become NaN.  pd.to_numeric is applied column by column
# because DataFrame.convert_objects is not available in newer pandas.
dta = dta.apply(pd.to_numeric, errors='coerce')
# Show the resulting dtypes (notebook output) to confirm the conversion.
dta.dtypes
# Check the data
# dta
# Reduce the murder count to a 0/1 flag: only the presence of
# murders matters here, not how many there were.
has_murder = dta['Murder'].gt(0)
dta['Murder_bool'] = has_murder.astype(int)
# Display the new column to confirm it was built correctly.
dta['Murder_bool']
# Column means for the cities with murders (Murder_bool = 1)
# versus the cities without (Murder_bool = 0).
by_murder_flag = dta.groupby('Murder_bool')
by_murder_flag.mean()
# The El Paso row, for comparison with the group means.
dta.loc['El Paso']
# Build the design matrices with patsy.  The term to the left of the
# tilde (Murder_bool) is the response being modelled; every term to
# the right is a predictor.
formula = ('Murder_bool ~ Population + Violent_Crime + Robbery + Agg_Assault + Property_Crime '
           '+ Burglary + Larceny_Theft + Motor_Veh_Theft + Arson')
y, X = dmatrices(formula, dta, return_type="dataframe")
# Flatten the single-column response frame into a 1-D array.
y = np.ravel(y)
# fit() returns the fitted estimator itself.
model = LogisticRegression().fit(X, y)
# Accuracy on the data the model was fitted to, printed next to the
# mean of y (the share of rows with Murder_bool = 1).
accuracy = model.score(X, y)
print(accuracy, y.mean())
# Pair each design-matrix column with its fitted coefficient.
# On Python 3.5 zip() returns a lazy iterable that pd.DataFrame
# could not consume directly, so it is materialised with list().
coefficients = np.transpose(model.coef_)
pd.DataFrame(list(zip(X.columns, coefficients)))
# (raw count column, per-capita column) pairs.  The per-capita names
# are spelled exactly as the regression formula below expects them.
per_capita_columns = [
    ('Violent_Crime', 'Violent_Crime_per_Capita'),
    ('Murder', 'Murder_per_capita'),
    ('Robbery', 'Robbery_per_Capita'),
    ('Agg_Assault', 'Agg_Assualt_per_Capita'),
    ('Property_Crime', 'Property_Crime_per_Capita'),
    ('Burglary', 'Burglary_per_Capita'),
    ('Larceny_Theft', 'Larceny_Theft_per_Capita'),
    ('Motor_Veh_Theft', 'Motor_Veh_Theft_per_Capita'),
    ('Arson', 'Arson_per_Capita'),
]
# Divide each raw count by the population to get a per-capita rate.
for raw_name, rate_name in per_capita_columns:
    dta[rate_name] = dta[raw_name] / dta['Population']
# Every rate has been computed, so the population and the raw
# count columns can now be removed from the table.
del dta['Population']
for raw_name, _ in per_capita_columns:
    del dta[raw_name]
# Second logistic regression, this time on the per-capita rates.
per_capita_formula = (
    'Murder_bool ~ Violent_Crime_per_Capita + Robbery_per_Capita + Agg_Assualt_per_Capita'
    '+ Property_Crime_per_Capita + Burglary_per_Capita + Larceny_Theft_per_Capita'
    '+ Motor_Veh_Theft_per_Capita + Arson_per_Capita'
)
y, X = dmatrices(per_capita_formula, dta, return_type="dataframe")
# Flatten the single-column response frame into a 1-D array.
y = np.ravel(y)
# fit() returns the fitted estimator itself.
model = LogisticRegression().fit(X, y)
# Accuracy on the fitted data, printed alongside the mean of y.
fit_accuracy = model.score(X, y)
print(fit_accuracy, y.mean())
# Table pairing each design-matrix column with its coefficient.
coefficient_rows = list(zip(X.columns, np.transpose(model.coef_)))
pd.DataFrame(coefficient_rows)
# Group means and the El Paso row again, now on the per-capita columns.
dta.groupby('Murder_bool').mean()
dta.loc['El Paso']