# -*- coding: utf-8 -*-
"""
Created on Mon Aug 22 22:03:44 2016
@author: Tammy
"""
"""
######################################################
#--------- SCRAPING CODE FROM PART ONE ---------------
######################################################
"""
import urllib2
from bs4 import BeautifulSoup
import re
import pandas as pd
# define base URL for county website
base = 'http://apps2.polkcountyiowa.gov/inmatesontheweb/'
# empty list that we will fill with dicts (one per arrest record)
master = []
# get the link IDs for each inmate page so that we can iterate over each
url = urllib2.urlopen(base)
doc = url.read()
src = BeautifulSoup(doc, 'html.parser')
arrests = src.find_all('a') # find all link tags on our table of contents page
pattern = re.compile(".*(bi)") # links to the individual arrest pages contain "bi"
# iterate through each link, check that it's the type we want, grab the rest of
# the URL, then parse through each arrest page to grab info
for arrest in range(1, len(arrests)):
    # some <a> tags may have no href at all, so use .get() with a default
    href = arrests[arrest].get('href', '')
    if pattern.match(href):
        site = base + href
        url2 = urllib2.urlopen(site)
        doc2 = url2.read()
        src2 = BeautifulSoup(doc2, 'html.parser')
        # create a list of all td tags on the arrest record page
        td = src2.find_all('td')
        # clean up the td tag strings
        b = list()
        for t in td:
            b.append(str(t.string.strip()))
        # define a dictionary of all the info housed in the tags
        info = {'ID': b[0], 'Name': b[1], 'Arrest date/time': b[2], 'Age': int(b[3]),
                'Height': b[4], 'Weight': int(b[5]), 'Race': b[6], 'Sex': b[7],
                'Eyes': b[8], 'Hair': b[9], 'Case #': b[10], 'Description': b[11],
                'Bond': b[12], 'Bond Type': b[13]}
        # add the latest info to the master list
        master.append(info)
# turn into a data frame that we can play with
data = pd.DataFrame(master)
# convert certain columns to categoricals
cats = ['Race', 'Sex', 'Eyes', 'Hair', 'Description', 'Bond Type']
for cat in cats:
    data[cat] = data[cat].astype('category')
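# to avoid re-running the scrape every time, it can help to cache the frame locally
# (just a convenience sketch; the filename is arbitrary)
data.to_csv('polk_arrests.csv', index=False, encoding='utf-8')
# data = pd.read_csv('polk_arrests.csv')  # reload later instead of scraping again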
"""
######################################################
#----------- NEW CODE FOR PART THREE------------------
######################################################
"""
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
# unique arrest descriptions and their frequencies
offenses = data.Description.value_counts()
# several common words show up repeatedly in the list of offenses, so let's manually
# bucket them and see how those buckets are distributed
offense_bucket = list()
for each in data.Description:
    if "VIOLATION" in each:
        new = "Violation"
    elif "POSSESS" in each:
        new = "Possession"
    elif "ASSAULT" in each:
        new = "Assault"
    elif "THEFT" in each:
        new = "Theft"
    elif "MURDER" in each:
        new = "Murder"
    elif "NARC" in each:
        new = "Narcotics"
    else:
        new = "Other"
    offense_bucket.append(new)
data['OffenseBucket'] = pd.Series(offense_bucket, index=data.index)
data.OffenseBucket.value_counts()
# the "Other" bucket is the largest bucket, so we're clearly missing some other
# categories.
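# quick check: normalize the bucket counts to proportions to see just how dominant
# "Other" is (value_counts with normalize=True returns fractions instead of counts)
print(data.OffenseBucket.value_counts(normalize=True).round(2))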
"""
Alternatively, we can allow the arrest types to cluster themselves using
text mining techniques. To keep things clear, from now on let's refer to each
individual offense description as a "document", and each unique word and/or
item of punctuation as a "token".
"""
# let's get down to just the unique documents:
unique_docs = list(set(data.Description))
# let's pause and see some basic stats on the text data we're working with.
# unique documents:
print("There are %d unique arrest types." %len(unique_docs))
unique_docs[1:5]
# clean up our data: remove numbers, symbols, and words like "violation", "degree", and "offense".
# in such short documents, words like "degree" can seem disproportionately important,
# and I care more about the type of crime than its severity, so I don't want a cluster
# that just collects all the 1st, 2nd, and 3rd degree offenses.
# stem so that terms like "possession" and "possess" are treated as the same word
stemmer = PorterStemmer()
clean_docs = []
for doc in unique_docs:
    word_list = []
    for word in nltk.word_tokenize(doc):
        if word.isalpha() and word not in ["DEGREE", "OFFENSE", "VIOLATION", "OFFENDER", "FAILURE"]:
            word_list.append(stemmer.stem(word.lower()))
    word_list2 = " ".join(word_list)
    clean_docs.append(word_list2)
clean_docs[1:5]
# calculate the term frequency-inverse document frequency (tf-idf) for each document.
# this measures how important each token is within a document, while down-weighting
# tokens that appear across many documents: a term that occurs often in one document
# but rarely elsewhere is important to that document.
# the default settings for TfidfVectorizer take care of removing symbols and punctuation,
# and converting to lowercase
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1,2), stop_words='english')
tfidf_matrix = tfidf.fit_transform(clean_docs)
print(tfidf_matrix.shape)
# the first number is our number of unique documents, and the second is the number
# of unique terms (unigrams and bigrams) in the vocabulary
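# to see what the vectorizer treats as important, peek at the highest-weighted terms
# for a single document (a rough sketch; get_feature_names() is the older scikit-learn
# spelling - newer versions use get_feature_names_out())
terms = tfidf.get_feature_names()
row = tfidf_matrix[0].toarray().ravel()  # tf-idf weights for the first cleaned document
top = row.argsort()[::-1][:5]            # indices of the five largest weights
print(clean_docs[0])
print([(terms[i], round(row[i], 3)) for i in top])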
# now we use k-means clustering to define k clusters. We don't know what the best
# number of clusters is, so we try a bunch and will compare them later.
inertia = []
ktrials = []
for k in range(2, 30):
    km = KMeans(n_clusters=k, n_init=100, random_state=12345, init='k-means++')
    km.fit(tfidf_matrix)
    inertia.append(km.inertia_)
    ktrials.append(float(k))
# plot our results and look for an "elbow"
plt.plot(ktrials, inertia)
plt.ylabel('inertia')
plt.xlabel('clusters')
plt.show()
# closest thing to an "elbow" is at 8, but this leaves out some important categories
# let's expand to 15 - more than that would be onerous
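# another way to sanity-check k (just a sketch, not a replacement for the elbow plot):
# the silhouette score summarizes how well-separated the clusters are, and cosine
# distance is a common choice for tf-idf vectors
from sklearn.metrics import silhouette_score
for k in (8, 10, 15):
    trial_labels = KMeans(n_clusters=k, n_init=100, random_state=12345).fit_predict(tfidf_matrix)
    print("k=%d silhouette=%.3f" % (k, silhouette_score(tfidf_matrix, trial_labels, metric='cosine')))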
km = KMeans(n_clusters=15, n_init=100, random_state=12345, init='k-means++')
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()
# join back to the original, unprocessed list of arrest descriptions
results = pd.DataFrame({'Description': unique_docs, 'Cluster': clusters})
# number of arrest descriptions in each cluster:
results.Cluster.value_counts(sort=False)
# what arrest descriptions are in each cluster?
[results.loc[results.Cluster == i, 'Description'] for i in range(0, 15)]
# some of these work quite well - there's a narcotics cluster, a theft cluster, etc.
# but there is definitely a "miscellaneous" category that doesn't really go away
# even when expanding the number of clusters. Expanding to 10 or 15 clusters does add
# categories like intoxication, driving offenses, and murder. Still, it's better than
# my manual assignments.
# give the clusters names, using the top centroid terms sketched below as a guide.
# Not a lot of overlap, although there is a sizeable miscellaneous category.
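# sketch: the few highest-weighted terms in each cluster centroid, as a naming aid
# (again using the older get_feature_names(); newer scikit-learn uses get_feature_names_out())
terms = tfidf.get_feature_names()
order = km.cluster_centers_.argsort()[:, ::-1]  # term indices sorted by weight, per cluster
for i in range(15):
    print("%d: %s" % (i, ", ".join(terms[ind] for ind in order[i, :4])))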
results.loc[results['Cluster'] == 0, 'ClusterName'] = 'Criminal Mischief'
results.loc[results['Cluster'] == 1, 'ClusterName'] = 'Theft'
results.loc[results['Cluster'] == 2, 'ClusterName'] = 'Weapons'
results.loc[results['Cluster'] == 3, 'ClusterName'] = 'Misc'
results.loc[results['Cluster'] == 4, 'ClusterName'] = 'Controlled Substances'
results.loc[results['Cluster'] == 5, 'ClusterName'] = 'Driving Under Influence'
results.loc[results['Cluster'] == 6, 'ClusterName'] = 'Domestic/Sexual Abuse'
results.loc[results['Cluster'] == 7, 'ClusterName'] = 'Trespassing'
results.loc[results['Cluster'] == 8, 'ClusterName'] = 'Murder'
results.loc[results['Cluster'] == 9, 'ClusterName'] = 'Injury'
results.loc[results['Cluster'] == 10, 'ClusterName'] = 'Burglary'
results.loc[results['Cluster'] == 11, 'ClusterName'] = 'Narcotics'
results.loc[results['Cluster'] == 12, 'ClusterName'] = 'Arson'
results.loc[results['Cluster'] == 13, 'ClusterName'] = 'Harassment'
results.loc[results['Cluster'] == 14, 'ClusterName'] = 'Intoxication/Drunk Driving'
# merge this back with our full arrest data
data = pd.merge(data, results, on='Description')
data.ClusterName.value_counts()
# still a large misc category, but a good distribution of other categories
""" how are the arrest types distributed across all individuals? by race? by gender?
time to do some graphing again
"""
import plotly
import plotly.graph_objs as go
import plotly.plotly as py
# setup for our tools
plotly.tools.set_credentials_file(username='tammylarmstrong', api_key='########')
# formatting
legend = dict(x=0, y=100)
def stack_layout(chartitle):
    global stack
    stack = go.Layout(barmode='stack',
                      xaxis=dict(tickangle=45),
                      title=chartitle,
                      legend=legend)
# compare offense types by gender
ct = pd.crosstab(data.ClusterName, [data.Sex], rownames=['Offense Type'], colnames=['Sex'])
data.ClusterName = data.ClusterName.astype('category')
arrestcats = data.ClusterName.cat.categories
female = go.Bar(x=arrestcats.tolist(),
y=ct.Female.values.tolist(),
name = 'Female')
male = go.Bar(x=arrestcats.tolist(),
y=ct.Male.values.tolist(),
name = 'Male')
sex_clusters = [male, female]
stack_layout('Arrest Types by Gender')
fig1 = go.Figure(data=sex_clusters, layout=stack)
py.plot(fig1, filename='arrest-types-by-gender')
# show race as a percentage of total arrests of that type
# (float() guards against integer division under Python 2)
ct2 = pd.crosstab(data.ClusterName, data.Race).apply(lambda r: r / float(r.sum()), axis=1)
ct2_round = ct2.round(2)
Black = go.Bar(x=arrestcats.tolist(),
y=ct2_round.Black.values.tolist(),
name = 'Black')
White = go.Bar(x=arrestcats.tolist(),
y=ct2_round.White.values.tolist(),
name = 'White')
Asian = go.Bar(x=arrestcats.tolist(),
y=ct2_round.Asian.values.tolist(),
name = 'Asian')
Pacific = go.Bar(x=arrestcats.tolist(),
y=ct2_round['Pacific Islander'].values.tolist(),
name = 'Pacific Islander')
Unknown = go.Bar(x=arrestcats.tolist(),
y=ct2_round['Unknown'].values.tolist(),
name='Unknown')
race_clusters = [Black, White, Asian, Pacific, Unknown]
layout = go.Layout(barmode='stack',
                   title='Race as percentage of each arrest type',
                   legend=legend,
                   yaxis=dict(range=[0, 1]),
                   annotations=[
                       dict(x=xi,
                            y=0.9,  # one y coordinate per annotation (a list here is not valid)
                            text=str(yi),
                            xanchor='center',
                            yanchor='bottom',
                            showarrow=False)
                       # reindex the counts so they line up with the category order on the x axis
                       for xi, yi in zip(arrestcats,
                                         data.ClusterName.value_counts().reindex(arrestcats))])
fig2 = go.Figure(data=race_clusters, layout=layout)
py.plot(fig2, filename='arrest-types-by-race')