# Print where the dataset was downloaded from so the provenance is part of the run's output.
print("Data source: https://data.baltimorecity.gov/Public-Safety/911-Police-Calls-for-Service/xviu-ezkt")
## Data source: https://data.baltimorecity.gov/Public-Safety/911-Police-Calls-for-Service/xviu-ezkt
def autolabel(rects, p, ax, symbol):
    """Write each bar's height as a text label just beyond the end of the bar.

    Args:
        rects: Iterable of bar-like objects exposing ``get_x()``,
            ``get_width()`` and ``get_height()`` (presumably matplotlib bar
            patches -- confirm against the callers).
        p: Format specification handed to ``format`` for the height,
            e.g. ``'.1f'``.
        ax: Object whose ``text`` method draws the label (an Axes-like object).
        symbol: Text placed directly in front of the formatted height,
            e.g. ``'$'``.
    """
    for rect in rects:
        height = rect.get_height()
        center_x = rect.get_x() + rect.get_width() / 2
        # The label is placed 1% beyond the bar's end. A bar with negative
        # height extends downwards, so its label must hang below that point
        # instead of growing back up into the bar.
        anchor = 'top' if height < 0 else 'bottom'
        ax.text(center_x, height * 1.01, symbol + format(height, p),
                fontsize=10, ha='center', color='black', va=anchor)
def generateBaseMap(default_location=(39.2858, -76.6206), default_zoom_start=11):
    """Create an empty folium map to draw further layers on.

    Args:
        default_location: Centre of the map, forwarded to
            ``folium.Map(location=...)``. The default is an immutable tuple
            rather than a list so that the default value can never be mutated
            and shared between calls. (The default coordinates are presumably
            central Baltimore, matching the data set -- confirm.)
        default_zoom_start: Initial zoom level, forwarded to
            ``folium.Map(zoom_start=...)``.

    Returns:
        The ``folium.Map`` instance, created with the scale control enabled.
    """
    base_map = folium.Map(location=default_location, control_scale=True, zoom_start=default_zoom_start)
    return base_map
print("Python System Version:" + sys.version)
## Python System Version:3.7.4 (tags/v3.7.4:e09359112e, Jul 8 2019, 20:34:20) [MSC v.1916 64 bit (AMD64)]

# Local copy of the CSV export of the 911 calls data set.
input_file = 'C:/work/DataScience/GB736/assignment-04/911_Police_Calls_for_Service.csv'

# Plot styling: seaborn dark grid theme and the "muted" palette for chart colours.
sns.set(style="darkgrid")
my_type_colors = sns.color_palette("muted")

## Reading input csv file into dataframe
# Only the first `rows` records are read; the load is timed and reported.
rows = 100000
start_time = time.time()
df0 = pd.read_csv(input_file, nrows=rows)
elapsed_seconds = time.time() - start_time
print("Reading took %s seconds to read %d lines." % (elapsed_seconds, df0.shape[0]))
## Reading took 1.2689998149871826 seconds to read 100000 lines.
print("Data shape:" + str(df0.shape))
## Data shape:(100000, 20)
Data manipulation
- Keep only the rows where ZipCode is not NA
- Replace the string 'None' with NA and drop every row that contains an NA
- Split the Location column into Street, City and Loc (the coordinate pair, stored as long & lat)
- Convert ZipCode, CouncilDistrict and PolicePost to int and then to string
- Break CallDateTime into call date, year, month, day, day of week, hour, minute and second
- Remove the Loc and Location columns after breaking them into separate columns (CallDateTime is kept)
- Take a sample of the rows for the initial analysis (currently 100%)
# Keep only the rows that carry a zip code.
df0 = df0[df0['ZipCode'].notna()]
# Missing values also appear as the literal string 'None': turn those into real
# NaN and drop every row that has any missing field.
df0 = df0.replace(to_replace='None', value=np.nan).dropna()
df0.head(2)
## RecordID CallNumber ... 2010 Census Wards Precincts Zip Codes
## 10 7473387 P200581839 ... 38.0 27937.0
## 12 7473302 P200581709 ... 39.0 27301.0
##
## [2 rows x 20 columns]

# Location is split on newlines into three parts: street, city and a
# parenthesised, comma-separated coordinate pair.
df0[["Street", "City", "Loc"]] = df0.Location.str.split("\n", expand=True)
# Remove the surrounding parentheses from the coordinate pair.
df0['Loc'] = df0['Loc'].astype(str).str.strip("()")
# NOTE(review): the first component is stored as 'long' and the second as
# 'lat'. If Location is formatted "(latitude, longitude)" these names are
# swapped -- confirm against the raw data and against downstream users of
# these columns before renaming.
df0[['long', 'lat']] = df0.Loc.str.split(", ", expand=True)

# Cast the identifier-like columns to int and then to str (presumably so that
# float-parsed values such as 21201.0 end up as the label '21201').
convert_dict = {'ZipCode': int, 'CouncilDistrict': int, 'PolicePost': int}
df0 = df0.astype(convert_dict)
convert_dict = {'ZipCode': str, 'CouncilDistrict': str, 'PolicePost': str}
df0 = df0.astype(convert_dict)

# Parse the timestamp once for the whole column instead of re-parsing the same
# string row by row for every derived field. The hour is read with %I
# (12-hour clock): strptime only honours the AM/PM marker (%p) when the hour is
# parsed with %I, so "%H ... %p" would report 02:01 PM as hour 2.
call_ts = pd.to_datetime(df0['CallDateTime'], format="%m/%d/%Y %I:%M:%S %p")
df0['CallDate'] = call_ts.dt.date  # actual date objects, not the bound .date method
df0['CallYear'] = call_ts.dt.year
df0['CallMonth'] = call_ts.dt.month
df0['CallDay'] = call_ts.dt.day
df0['CallWeekday'] = call_ts.dt.day_name()
df0['CallHour'] = call_ts.dt.hour
df0['CallMin'] = call_ts.dt.minute
df0['CallSec'] = call_ts.dt.second

# The raw location columns are no longer needed once they have been split up.
df0 = df0.drop(['Loc', 'Location'], axis=1)
# frac=1.0 keeps every row (in shuffled order); lower it to analyse a subset.
df = df0.sample(frac=1.0)
Plotting a pie chart of the 911 call distribution by call priority
# Display a few afternoon calls to check that PM times map to hours above 12.
df0[df0.CallHour>12].head(10)
## RecordID CallNumber CallDateTime ... CallHour CallMin CallSec
## 193 7473935 P200582467 02/27/2020 02:01:00 PM ... 14 1 0
## 210 7473878 P200582420 02/27/2020 01:44:00 PM ... 13 44 0
## 244 7473950 P200582468 02/27/2020 02:00:00 PM ... 14 0 0
## 245 7473960 P200582391 02/27/2020 01:37:00 PM ... 13 37 0
## 247 7473900 P200582370 02/27/2020 01:33:00 PM ... 13 33 0
## 267 7473989 P200582524 02/27/2020 02:17:00 PM ... 14 17 0
## 271 7473931 P200582373 02/27/2020 01:34:00 PM ... 13 34 0
## 274 7473818 P200582327 02/27/2020 01:18:00 PM ... 13 18 0
## 293 7473972 P200582482 02/27/2020 02:05:00 PM ... 14 5 0
## 339 7473938 P200582443 02/27/2020 01:55:00 PM ... 13 55 0
##
## [10 rows x 31 columns]

# Number of calls per priority level.
dfp1 = df.groupby(['Priority'], as_index=False).agg({'RecordID': 'count'})
dfp1.rename(columns={'RecordID': 'CallCount'}, inplace=True)
# Display only: the sorted copy is not assigned, so dfp1 keeps its order.
dfp1.sort_values(by=['CallCount'], ascending=False)
## Priority CallCount
## 3 Non-Emergency 5448
## 2 Medium 660
## 1 Low 396
## 0 High 180
dfp1.head(10)
## Priority CallCount
## 0 High 180
## 1 Low 396
## 2 Medium 660
## 3 Non-Emergency 5448

matplotlib.style.use('ggplot')
fig = plt.figure(figsize=(8, 8))
labels = dfp1.Priority
mydata = dfp1.CallCount
# `colors` and `explode` are not passed to plt.pie below (the seaborn palette
# in my_type_colors is used instead); they are kept in case later code reads them.
colors = ['red', 'green', 'blue', 'purple', 'orange']
# The builtin float replaces the np.float alias, which was removed in
# NumPy 1.24 and raises AttributeError there.
explode = np.arange(dfp1.Priority.count(), dtype=float)
patches, texts, autotexts = plt.pie(mydata, labels=labels, colors=my_type_colors, autopct='%1.1f%%', shadow=True, startangle=160)

# Style the slice labels and the percentage labels with plain loops: the
# setters are called only for their side effects.
for slice_label in texts:
    slice_label.set_fontsize(18)
for pct_label in autotexts:
    pct_label.set_fontsize(18)
    pct_label.set_color('white')
# The first slice gets a black percentage label and a larger slice label.
autotexts[0].set_color('Black')
texts[0].set_fontsize(20)

# Equal axis scaling keeps the pie circular.
plt.axis('equal')
## (-1.1006644869279618, 1.1058262695886127, -1.1061446313786636, 1.1050543291946804)
plt.title('911 Calls distribution by Call Priority', fontsize=20, color='Blue')
plt.show()
