##AAQ1 Question 1a

##codebook install packages

## Warning: package 'haven' was built under R version 4.1.2
## Warning: package 'tidyverse' was built under R version 4.1.2
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.5     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.1     v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.2
## Warning: package 'tidyr' was built under R version 4.1.2
## Warning: package 'readr' was built under R version 4.1.2
## Warning: package 'purrr' was built under R version 4.1.2
## Warning: package 'dplyr' was built under R version 4.1.2
## Warning: package 'forcats' was built under R version 4.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## Warning: package 'summarytools' was built under R version 4.1.2
## 
## Attaching package: 'summarytools'
## The following object is masked from 'package:tibble':
## 
##     view
## Keep the four numeric USArrests columns and render a summarytools overview
data <- USArrests
dfr <- data %>%
  select(Murder, Assault, UrbanPop, Rape)
print(dfSummary(dfr, graph.magnif = 0.75), method = "render")

Data Frame Summary

dfr

Dimensions: 50 x 4
Duplicates: 0
No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
1 Murder [numeric]
Mean (sd) : 7.8 (4.4)
min ≤ med ≤ max:
0.8 ≤ 7.2 ≤ 17.4
IQR (CV) : 7.2 (0.6)
43 distinct values 50 (100.0%) 0 (0.0%)
2 Assault [integer]
Mean (sd) : 170.8 (83.3)
min ≤ med ≤ max:
45 ≤ 159 ≤ 337
IQR (CV) : 140 (0.5)
45 distinct values 50 (100.0%) 0 (0.0%)
3 UrbanPop [integer]
Mean (sd) : 65.5 (14.5)
min ≤ med ≤ max:
32 ≤ 66 ≤ 91
IQR (CV) : 23.2 (0.2)
36 distinct values 50 (100.0%) 0 (0.0%)
4 Rape [numeric]
Mean (sd) : 21.2 (9.4)
min ≤ med ≤ max:
7.3 ≤ 20.1 ≤ 46
IQR (CV) : 11.1 (0.4)
48 distinct values 50 (100.0%) 0 (0.0%)

Generated by summarytools 1.0.0 (R version 4.1.1)
2021-12-31

##Exploratory Data Analysis

load the libraries

library(ggplot2)
library(tidyverse)
library(ggvis)
## Warning: package 'ggvis' was built under R version 4.1.2
## 
## Attaching package: 'ggvis'
## The following object is masked from 'package:ggplot2':
## 
##     resolution

loading the dataset

## Build the table from R's built-in USArrests data rather than from a CSV
## stored at a machine-specific path, so the script runs on any machine.
## The state names (the row names of USArrests) become the first column under
## the placeholder name "X"; the four crime columns keep their names and types.
Arrest <- data.frame(
  X = rownames(USArrests),
  USArrests,
  row.names = NULL,
  stringsAsFactors = FALSE
)

to view the dataset

## summarytools::view() masks tibble::view() once summarytools is attached and
## only accepts summarytools objects, so a bare view(Arrest) fails with
## "x must either be a summarytools object ...". Call the tibble viewer
## explicitly to open the data frame.
tibble::view(Arrest)

Since the first column is missing a name, we need to add one.

## Give the first (state-name) column a proper name
names(Arrest)[1] <- "US_States"

Find the total number of variables and observations.

Arrest %>%
  glimpse()
## Rows: 50
## Columns: 5
## $ US_States <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "California", "C~
## $ Murder    <dbl> 13.2, 10.0, 8.1, 8.8, 9.0, 7.9, 3.3, 5.9, 15.4, 17.4, 5.3, 2~
## $ Assault   <int> 236, 263, 294, 190, 276, 204, 110, 238, 335, 211, 46, 120, 2~
## $ UrbanPop  <int> 58, 48, 80, 50, 91, 78, 77, 72, 80, 60, 83, 54, 83, 65, 57, ~
## $ Rape      <dbl> 21.2, 44.5, 31.0, 19.5, 40.6, 38.7, 11.1, 15.8, 31.9, 25.8, ~

Since the dataset is clean, we can visualize it.

##barplot

A barplot to find the relation between arrests and US_States.

####US_State versus Murder Arrest

##histogram

See how the urban population is distributed across US_States.

##boxplot

The boxplot indicates that on average more arrests are made due to rape assaults.

##Scatterplot

To find out whether highly urban populations have more crimes.

##Q1b

Loading packages

library(dplyr)

Create my own dataset with 20 observations and 5 variables

##    umst_id umst_gender age heightInm weightInkg
## 1     u001           M  18      1.80         80
## 2     u002           M  19      1.70         65
## 3     u003           F  20      1.65         51
## 4     u004           F  20      1.60         48
## 5     u005           M  18      1.73         75
## 6     u006           F  19      1.50         39
## 7     u007           M  23      1.65         53
## 8     u008           M  24      1.78         73
## 9     u009           F  25      1.55         40
## 10    u010           F  23      1.60         55
## 11    u011           M  23      1.83         67
## 12    u012           M  20      1.90         70
## 13    u013           F  19      1.75         70
## 14    u014           M  18      1.65         63
## 15    u015           F  18      1.54         52
## 16    u016           F  24      1.63         55
## 17    u017           M  23      1.78         73
## 18    u018           F  21      1.56         54
## 19    u019           F  21      1.56         42
## 20    u020           M  22      1.80         70

##1. filter()

## Keep only the students whose weight exceeds 70 kg
student_BMI %>%
  filter(weightInkg > 70)
##   umst_id umst_gender age heightInm weightInkg
## 1    u001           M  18      1.80         80
## 2    u005           M  18      1.73         75
## 3    u008           M  24      1.78         73
## 4    u017           M  23      1.78         73
## Two conditions can be combined with &: weight above 70 and height above 1.70
student_BMI %>%
  filter(weightInkg > 70 & heightInm > 1.70)
##   umst_id umst_gender age heightInm weightInkg
## 1    u001           M  18      1.80         80
## 2    u005           M  18      1.73         75
## 3    u008           M  24      1.78         73
## 4    u017           M  23      1.78         73

##2.Arrange

## Sort the UM students by age (ascending)
student_BMI %>%
  arrange(age)
##    umst_id umst_gender age heightInm weightInkg
## 1     u001           M  18      1.80         80
## 2     u005           M  18      1.73         75
## 3     u014           M  18      1.65         63
## 4     u015           F  18      1.54         52
## 5     u002           M  19      1.70         65
## 6     u006           F  19      1.50         39
## 7     u013           F  19      1.75         70
## 8     u003           F  20      1.65         51
## 9     u004           F  20      1.60         48
## 10    u012           M  20      1.90         70
## 11    u018           F  21      1.56         54
## 12    u019           F  21      1.56         42
## 13    u020           M  22      1.80         70
## 14    u007           M  23      1.65         53
## 15    u010           F  23      1.60         55
## 16    u011           M  23      1.83         67
## 17    u017           M  23      1.78         73
## 18    u008           M  24      1.78         73
## 19    u016           F  24      1.63         55
## 20    u009           F  25      1.55         40
## The same sort in descending order of age
student_BMI %>%
  arrange(desc(age))
##    umst_id umst_gender age heightInm weightInkg
## 1     u009           F  25      1.55         40
## 2     u008           M  24      1.78         73
## 3     u016           F  24      1.63         55
## 4     u007           M  23      1.65         53
## 5     u010           F  23      1.60         55
## 6     u011           M  23      1.83         67
## 7     u017           M  23      1.78         73
## 8     u020           M  22      1.80         70
## 9     u018           F  21      1.56         54
## 10    u019           F  21      1.56         42
## 11    u003           F  20      1.65         51
## 12    u004           F  20      1.60         48
## 13    u012           M  20      1.90         70
## 14    u002           M  19      1.70         65
## 15    u006           F  19      1.50         39
## 16    u013           F  19      1.75         70
## 17    u001           M  18      1.80         80
## 18    u005           M  18      1.73         75
## 19    u014           M  18      1.65         63
## 20    u015           F  18      1.54         52
## Sort by gender first, then by age within each gender
student_BMI %>%
  arrange(umst_gender, age)
##    umst_id umst_gender age heightInm weightInkg
## 1     u015           F  18      1.54         52
## 2     u006           F  19      1.50         39
## 3     u013           F  19      1.75         70
## 4     u003           F  20      1.65         51
## 5     u004           F  20      1.60         48
## 6     u018           F  21      1.56         54
## 7     u019           F  21      1.56         42
## 8     u010           F  23      1.60         55
## 9     u016           F  24      1.63         55
## 10    u009           F  25      1.55         40
## 11    u001           M  18      1.80         80
## 12    u005           M  18      1.73         75
## 13    u014           M  18      1.65         63
## 14    u002           M  19      1.70         65
## 15    u012           M  20      1.90         70
## 16    u020           M  22      1.80         70
## 17    u007           M  23      1.65         53
## 18    u011           M  23      1.83         67
## 19    u017           M  23      1.78         73
## 20    u008           M  24      1.78         73

##3. mutate()

## Add a BMI column for each student: weight divided by height squared
BMI_result <- student_BMI %>%
  mutate(BMI = weightInkg / (heightInm * heightInm))

##4.select()

## Select the run of columns from heightInm through BMI
BMI_result %>%
  select(heightInm:BMI)
##    heightInm weightInkg      BMI
## 1       1.80         80 24.69136
## 2       1.70         65 22.49135
## 3       1.65         51 18.73278
## 4       1.60         48 18.75000
## 5       1.73         75 25.05931
## 6       1.50         39 17.33333
## 7       1.65         53 19.46740
## 8       1.78         73 23.04002
## 9       1.55         40 16.64932
## 10      1.60         55 21.48437
## 11      1.83         67 20.00657
## 12      1.90         70 19.39058
## 13      1.75         70 22.85714
## 14      1.65         63 23.14050
## 15      1.54         52 21.92613
## 16      1.63         55 20.70082
## 17      1.78         73 23.04002
## 18      1.56         54 22.18935
## 19      1.56         42 17.25838
## 20      1.80         70 21.60494
## Show every column except gender by prefixing it with a '-' sign
BMI_result %>%
  select(-umst_gender)
##    umst_id age heightInm weightInkg      BMI
## 1     u001  18      1.80         80 24.69136
## 2     u002  19      1.70         65 22.49135
## 3     u003  20      1.65         51 18.73278
## 4     u004  20      1.60         48 18.75000
## 5     u005  18      1.73         75 25.05931
## 6     u006  19      1.50         39 17.33333
## 7     u007  23      1.65         53 19.46740
## 8     u008  24      1.78         73 23.04002
## 9     u009  25      1.55         40 16.64932
## 10    u010  23      1.60         55 21.48437
## 11    u011  23      1.83         67 20.00657
## 12    u012  20      1.90         70 19.39058
## 13    u013  19      1.75         70 22.85714
## 14    u014  18      1.65         63 23.14050
## 15    u015  18      1.54         52 21.92613
## 16    u016  24      1.63         55 20.70082
## 17    u017  23      1.78         73 23.04002
## 18    u018  21      1.56         54 22.18935
## 19    u019  21      1.56         42 17.25838
## 20    u020  22      1.80         70 21.60494
## Show only the BMI column
BMI_result %>%
  select(BMI)
##         BMI
## 1  24.69136
## 2  22.49135
## 3  18.73278
## 4  18.75000
## 5  25.05931
## 6  17.33333
## 7  19.46740
## 8  23.04002
## 9  16.64932
## 10 21.48437
## 11 20.00657
## 12 19.39058
## 13 22.85714
## 14 23.14050
## 15 21.92613
## 16 20.70082
## 17 23.04002
## 18 22.18935
## 19 17.25838
## 20 21.60494

##5.summarise()

## Find the maximum BMI
BMI_result %>%
  summarise(max(BMI))
##   max(BMI)
## 1 25.05931
## Find the minimum BMI
BMI_result %>%
  summarise(min(BMI))
##   min(BMI)
## 1 16.64932
## Check whether any UM student has a BMI strictly above 20
BMI_result %>%
  summarise(any(BMI > 20))
##   any(BMI > 20)
## 1          TRUE