Hello everyone! My name is William Kegg and I am a fulltime MS BANA Student. I am from Batavia, Ohio (I still commute from there) and recently graduated from UC in the Spring with undergraduate degrees in Business Analytics and Insurance & Risk Management. I am currently on my 13th internship rotation at Great American Insurance Group (6th with them specifically) as a Business Analyst for their IT Asset Management Team. Based on my work in my undergraduate program and my work experience, I would say I am between a beginner and intermediate level for R. I would say that I am at the same level for Python and SQL as well. I look forward to working with you all this semester!
As stated before, I have a basic understanding of R and other programming languages like Python and SQL. Here are a few examples of what I can do with these languages:
Import data and the message print out will also tell you
df <- readr::read_csv("Lab 2 Data Source/blood_transfusion.csv")
## Rows: 748 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Class
## dbl (4): Recency, Frequency, Monetary, Time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(df)
## spec_tbl_df [748 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Recency : num [1:748] 2 0 1 2 1 4 2 1 2 5 ...
## $ Frequency: num [1:748] 50 13 16 20 24 4 7 12 9 46 ...
## $ Monetary : num [1:748] 12500 3250 4000 5000 6000 1000 1750 3000 2250 11500 ...
## $ Time : num [1:748] 98 28 35 45 77 4 14 35 22 98 ...
## $ Class : chr [1:748] "donated" "donated" "donated" "donated" ...
## - attr(*, "spec")=
## .. cols(
## .. Recency = col_double(),
## .. Frequency = col_double(),
## .. Monetary = col_double(),
## .. Time = col_double(),
## .. Class = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
There are Numeric and Character variables.
sum(is.na(df))
## [1] 0
There are 0 missing values.
# Number of Rows
nrow(df)
## [1] 748
# Number of Columns
ncol(df)
## [1] 5
# Number of Both Columns and Rows
dim(df)
## [1] 748 5
# Length
length(df)
## [1] 5
head(df, 10)
## # A tibble: 10 × 5
## Recency Frequency Monetary Time Class
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 2 50 12500 98 donated
## 2 0 13 3250 28 donated
## 3 1 16 4000 35 donated
## 4 2 20 5000 45 donated
## 5 1 24 6000 77 not donated
## 6 4 4 1000 4 not donated
## 7 2 7 1750 14 donated
## 8 1 12 3000 35 not donated
## 9 2 9 2250 22 donated
## 10 5 46 11500 98 donated
tail(df, 10)
## # A tibble: 10 × 5
## Recency Frequency Monetary Time Class
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 23 1 250 23 not donated
## 2 23 4 1000 52 not donated
## 3 23 1 250 23 not donated
## 4 23 7 1750 88 not donated
## 5 16 3 750 86 not donated
## 6 23 2 500 38 not donated
## 7 21 2 500 52 not donated
## 8 23 3 750 62 not donated
## 9 39 1 250 39 not donated
## 10 72 1 250 72 not donated
Monetary column.
What is the value?df[100, 'Monetary']
## # A tibble: 1 × 1
## Monetary
## <dbl>
## 1 1750
Monetary column. What is the mean of
this vector?mean(df[['Monetary']])
## [1] 1378.676
Monetary is greater than the mean value. How many rows are
in the resulting data frame?above_avg <- df[['Monetary']] > mean(df[['Monetary']])
df[above_avg, 'Monetary']
## # A tibble: 267 × 1
## Monetary
## <dbl>
## 1 12500
## 2 3250
## 3 4000
## 4 5000
## 5 6000
## 6 1750
## 7 3000
## 8 2250
## 9 11500
## 10 5750
## # … with 257 more rows
df2 <- readr::read_table("Lab 2 Data Source/dayton.txt", col_names = c('Month', 'Day', 'Year', 'Avg_temp'))
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## Month = col_double(),
## Day = col_double(),
## Year = col_double(),
## Avg_temp = col_double()
## )
## Warning: 3 parsing failures.
## row col expected actual file
## 4623 -- 4 columns 5 columns 'Lab 2 Data Source/dayton.txt'
## 5016 -- 4 columns 5 columns 'Lab 2 Data Source/dayton.txt'
## 5213 -- 4 columns 5 columns 'Lab 2 Data Source/dayton.txt'
dim(df2)
## [1] 9265 4
mean(is.na(df2))
## [1] 0
df2[365, ]
## # A tibble: 1 × 4
## Month Day Year Avg_temp
## <dbl> <dbl> <dbl> <dbl>
## 1 12 31 1995 39.3
jan_2000 <- df2[df2[['Month']] == 1 & df2[['Year']] == 2000, ]
median(jan_2000[['Avg_temp']])
## [1] 27.1
df2[which.max(df2[['Avg_temp']]),]
## # A tibble: 1 × 4
## Month Day Year Avg_temp
## <dbl> <dbl> <dbl> <dbl>
## 1 7 7 2012 89.2
df2[which.min(df2[['Avg_temp']]),]
## # A tibble: 1 × 4
## Month Day Year Avg_temp
## <dbl> <dbl> <dbl> <dbl>
## 1 12 24 1998 -99
sum(df2[['Avg_temp']] == -99)
## [1] 14
mean(df2[['Avg_temp']])
## [1] 54.39876
bad_values <- df2[['Avg_temp']] == -99
df2[bad_values, 'Avg_temp'] <- NA
mean(df2[['Avg_temp']], na.rm = TRUE)
## [1] 54.6309
df3 <- readr::read_csv("Lab 2 Data Source/PDI__Police_Data_Initiative__Crime_Incidents.csv")
## Rows: 15155 Columns: 40
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (34): INSTANCEID, INCIDENT_NO, DATE_REPORTED, DATE_FROM, DATE_TO, CLSD, ...
## dbl (6): UCR, LONGITUDE_X, LATITUDE_X, TOTALNUMBERVICTIMS, TOTALSUSPECTS, ZIP
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dim(df3)
## [1] 15155 40
sum(is.na(df3))
## [1] 95592
colSums(is.na(df3))
## INSTANCEID INCIDENT_NO
## 0 0
## DATE_REPORTED DATE_FROM
## 0 2
## DATE_TO CLSD
## 9 545
## UCR DST
## 10 0
## BEAT OFFENSE
## 28 10
## LOCATION THEFT_CODE
## 2 10167
## FLOOR SIDE
## 14127 14120
## OPENING HATE_BIAS
## 14508 0
## DAYOFWEEK RPT_AREA
## 423 239
## CPD_NEIGHBORHOOD WEAPONS
## 249 5
## DATE_OF_CLEARANCE HOUR_FROM
## 2613 2
## HOUR_TO ADDRESS_X
## 9 148
## LONGITUDE_X LATITUDE_X
## 1714 1714
## VICTIM_AGE VICTIM_RACE
## 0 2192
## VICTIM_ETHNICITY VICTIM_GENDER
## 2192 2192
## SUSPECT_AGE SUSPECT_RACE
## 0 7082
## SUSPECT_ETHNICITY SUSPECT_GENDER
## 7082 7082
## TOTALNUMBERVICTIMS TOTALSUSPECTS
## 33 7082
## UCR_GROUP ZIP
## 10 1
## COMMUNITY_COUNCIL_NEIGHBORHOOD SNA_NEIGHBORHOOD
## 0 0
DATE_REPORTED column, what is the
range of dates included in this data?range(df3[['DATE_REPORTED']])
## [1] "01/01/2022 01:08:00 AM" "06/26/2022 12:50:00 AM"
table(), what is the most common age range for
known SUSPECT_AGEs?table(df3[['SUSPECT_AGE']])
##
## 18-25 26-30 31-40 41-50 51-60 61-70 OVER 70 UNDER 18
## 1778 1126 1525 659 298 121 16 629
## UNKNOWN
## 9003
table() to get the number of incidents per zip
code. Sort this table for those zip codes with the most activity to the
least activity.sort(table(df3['ZIP']), decreasing = TRUE)
##
## 45202 45205 45211 45238 45229 45219 45225 45214 45237 45223 45206 45220 45232
## 2049 1110 1094 956 913 863 811 774 699 653 616 477 477
## 45224 45209 45208 45204 45216 45227 45207 45203 45230 45213 45239 45226 45217
## 429 380 359 348 302 286 245 226 214 190 169 112 100
## 45221 45233 45212 45215 45231 45228 42502 45236 45244 45248 4523 5239
## 90 77 61 47 7 5 3 3 3 3 2 1
DAYOFWEEK column, which day do most incidents
occur on? What is the proportion of incidents that fall ontable(df3[['DAYOFWEEK']]) / sum(table(df3[['DAYOFWEEK']]))
##
## FRIDAY MONDAY SATURDAY SUNDAY THURSDAY TUESDAY WEDNESDAY
## 0.1369807 0.1438365 0.1542221 0.1448547 0.1363019 0.1432935 0.1405105
# I chose the columns of HATE_BIAS, WEAPONS, AND TOTALNUMBERVICTIMS. These seemed like the most significant for the kind of analysis I wanted to do.
colSums(is.na(df3))
## INSTANCEID INCIDENT_NO
## 0 0
## DATE_REPORTED DATE_FROM
## 0 2
## DATE_TO CLSD
## 9 545
## UCR DST
## 10 0
## BEAT OFFENSE
## 28 10
## LOCATION THEFT_CODE
## 2 10167
## FLOOR SIDE
## 14127 14120
## OPENING HATE_BIAS
## 14508 0
## DAYOFWEEK RPT_AREA
## 423 239
## CPD_NEIGHBORHOOD WEAPONS
## 249 5
## DATE_OF_CLEARANCE HOUR_FROM
## 2613 2
## HOUR_TO ADDRESS_X
## 9 148
## LONGITUDE_X LATITUDE_X
## 1714 1714
## VICTIM_AGE VICTIM_RACE
## 0 2192
## VICTIM_ETHNICITY VICTIM_GENDER
## 2192 2192
## SUSPECT_AGE SUSPECT_RACE
## 0 7082
## SUSPECT_ETHNICITY SUSPECT_GENDER
## 7082 7082
## TOTALNUMBERVICTIMS TOTALSUSPECTS
## 33 7082
## UCR_GROUP ZIP
## 10 1
## COMMUNITY_COUNCIL_NEIGHBORHOOD SNA_NEIGHBORHOOD
## 0 0
table(df3[['HATE_BIAS']])
##
## 11--ANTI-WHITE 12--ANTI-BLACK
## 7 6
## 13--ANTI-AMERICAN INDIAN/ALASKAN NATIVE 21--ANTI-JEWISH
## 6 2
## 24--ANTI-ISLAMIC (MOSLEM) 41--ANTI-MALE HOMOSEXUAL (GAY)
## 1 3
## 43--ANTI-HOMOSEXUAL (GAY AND LESBIAN) 88--DOMESTIC VIOLENCE
## 1 1241
## 89--GANG RELATED N--NO BIAS/NOT APPLICABLE
## 3 13885
table(df3[['WEAPONS']])
##
## 11 - FIREARM (TYPE NOT STATED)
## 531
## 12 - HANDGUN
## 934
## 12A - AUTOMATIC HANDGUN
## 16
## 13 - RIFLE
## 42
## 14 - SHOTGUN
## 13
## 15 - OTHER FIREARM
## 3
## 16 - IMITATION FIREARM
## 4
## 17 - SIMULATED FIREARM
## 6
## 18 - BB AND PELLET GUNS
## 32
## 20 - KNIFE/CUTTING INSTRUMENT (ICEPICK, AX, ETC.)
## 247
## 30 - BLUNT OBJECT (CLUB, HAMMER, ETC.)
## 333
## 35 - MOTOR VEHICLE (WHEN USED AS WEAPON)
## 122
## 40 - PERSONAL WEAPONS (HANDS, FEET, TEETH, ETC.)
## 2827
## 50 - POISON
## 1
## 60 - EXPLOSIVES
## 1
## 65 - FIRE/INCENDIARY DEVICE
## 7
## 70 - DRUGS/NARCOTICS/SLEEPING PILLS
## 2
## 80 - OTHER WEAPON
## 348
## 85 - ASPHYXIATION (BY DROWNING, STRANGULATION, SUFFOCATION)
## 2
## 99 - NONE
## 8106
## U - UNKNOWN
## 1573
table(df3[['TOTALNUMBERVICTIMS']])
##
## 1 2 3 4 5 6 7 8 9 10 12 13
## 12146 1919 481 228 40 120 35 42 52 10 36 13