##AAQ1 Question 1a
##codebook install packages
## Warning: package 'haven' was built under R version 4.1.2
## Warning: package 'tidyverse' was built under R version 4.1.2
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.5 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.1 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.2
## Warning: package 'tidyr' was built under R version 4.1.2
## Warning: package 'readr' was built under R version 4.1.2
## Warning: package 'purrr' was built under R version 4.1.2
## Warning: package 'dplyr' was built under R version 4.1.2
## Warning: package 'forcats' was built under R version 4.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## Warning: package 'summarytools' was built under R version 4.1.2
##
## Attaching package: 'summarytools'
## The following object is masked from 'package:tibble':
##
## view
# Start from the built-in USArrests data and keep the Murder, Assault,
# UrbanPop and Rape columns for the summary.
data <- USArrests
dfr <- select(data, Murder, Assault, UrbanPop, Rape)
# Render the summarytools overview table with graph magnification 0.75.
print(dfSummary(dfr, graph.magnif = 0.75), method = "render")
| No | Variable | Stats / Values | Freqs (% of Valid) | Graph | Valid | Missing | ||||
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Murder [numeric] |
|
43 distinct values | 50 (100.0%) | 0 (0.0%) | |||||
| 2 | Assault [integer] |
|
45 distinct values | 50 (100.0%) | 0 (0.0%) | |||||
| 3 | UrbanPop [integer] |
|
36 distinct values | 50 (100.0%) | 0 (0.0%) | |||||
| 4 | Rape [numeric] |
|
48 distinct values | 50 (100.0%) | 0 (0.0%) |
Generated by summarytools 1.0.0 (R version 4.1.1)
2021-12-31
##Exploratory Data Analysis
load the libraries
library(ggplot2)
library(tidyverse)
library(ggvis)
## Warning: package 'ggvis' was built under R version 4.1.2
##
## Attaching package: 'ggvis'
## The following object is masked from 'package:ggplot2':
##
## resolution
loading the dataset
# Read the arrests CSV into a data frame.
# NOTE(review): this is an absolute, machine-specific path; it will only
# resolve on the original author's computer.
Arrest <- read.csv(
  file = "C:/Users/jxsoo/Dropbox/PC (2)/Downloads/USArrests.csv"
)
to view the dataset
# summarytools::view() masks tibble::view() once summarytools is attached, and
# it only accepts summarytools objects, so call the tibble viewer explicitly
# to inspect a plain data frame.
tibble::view(Arrest)
Since the first column is missing a name, we need to add one
# Give the first column the name "US_States".
names(Arrest)[1L] <- "US_States"
To find the total number of variables and observations
# Print the row count, column count, and a type/value preview of each column.
Arrest %>% glimpse()
## Rows: 50
## Columns: 5
## $ US_States <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "California", "C~
## $ Murder <dbl> 13.2, 10.0, 8.1, 8.8, 9.0, 7.9, 3.3, 5.9, 15.4, 17.4, 5.3, 2~
## $ Assault <int> 236, 263, 294, 190, 276, 204, 110, 238, 335, 211, 46, 120, 2~
## $ UrbanPop <int> 58, 48, 80, 50, 91, 78, 77, 72, 80, 60, 83, 54, 83, 65, 57, ~
## $ Rape <dbl> 21.2, 44.5, 31.0, 19.5, 40.6, 38.7, 11.1, 15.8, 31.9, 25.8, ~
Since the dataset is clean, we can now visualize it.
##barplot
A barplot to show the relationship between arrests and US_States.
####US_State versus Murder Arrest
##histogram
See how the urban population is distributed across US_States.
##boxplot
The boxplot indicates that, on average, more arrests are made due to rape assaults.
##Scatterplot
To find out whether highly urban populations have more crimes
##Q1b loading packages
library(dplyr)
Create my own dataset with 20 observations and 5 variables
## umst_id umst_gender age heightInm weightInkg
## 1 u001 M 18 1.80 80
## 2 u002 M 19 1.70 65
## 3 u003 F 20 1.65 51
## 4 u004 F 20 1.60 48
## 5 u005 M 18 1.73 75
## 6 u006 F 19 1.50 39
## 7 u007 M 23 1.65 53
## 8 u008 M 24 1.78 73
## 9 u009 F 25 1.55 40
## 10 u010 F 23 1.60 55
## 11 u011 M 23 1.83 67
## 12 u012 M 20 1.90 70
## 13 u013 F 19 1.75 70
## 14 u014 M 18 1.65 63
## 15 u015 F 18 1.54 52
## 16 u016 F 24 1.63 55
## 17 u017 M 23 1.78 73
## 18 u018 F 21 1.56 54
## 19 u019 F 21 1.56 42
## 20 u020 M 22 1.80 70
## Keep only the students whose weight exceeds 70 kg
student_BMI %>%
  filter(weightInkg > 70)
## umst_id umst_gender age heightInm weightInkg
## 1 u001 M 18 1.80 80
## 2 u005 M 18 1.73 75
## 3 u008 M 24 1.78 73
## 4 u017 M 23 1.78 73
## Two conditions can be combined in one filter with `&`:
## weight above 70 kg and height above 1.70 m
student_BMI %>%
  filter(weightInkg > 70 & heightInm > 1.70)
## umst_id umst_gender age heightInm weightInkg
## 1 u001 M 18 1.80 80
## 2 u005 M 18 1.73 75
## 3 u008 M 24 1.78 73
## 4 u017 M 23 1.78 73
##2.Arrange
## Sort the UM students by age, youngest first
student_BMI %>%
  arrange(age)
## umst_id umst_gender age heightInm weightInkg
## 1 u001 M 18 1.80 80
## 2 u005 M 18 1.73 75
## 3 u014 M 18 1.65 63
## 4 u015 F 18 1.54 52
## 5 u002 M 19 1.70 65
## 6 u006 F 19 1.50 39
## 7 u013 F 19 1.75 70
## 8 u003 F 20 1.65 51
## 9 u004 F 20 1.60 48
## 10 u012 M 20 1.90 70
## 11 u018 F 21 1.56 54
## 12 u019 F 21 1.56 42
## 13 u020 M 22 1.80 70
## 14 u007 M 23 1.65 53
## 15 u010 F 23 1.60 55
## 16 u011 M 23 1.83 67
## 17 u017 M 23 1.78 73
## 18 u008 M 24 1.78 73
## 19 u016 F 24 1.63 55
## 20 u009 F 25 1.55 40
## The same sort in descending order, oldest first
student_BMI %>%
  arrange(desc(age))
## umst_id umst_gender age heightInm weightInkg
## 1 u009 F 25 1.55 40
## 2 u008 M 24 1.78 73
## 3 u016 F 24 1.63 55
## 4 u007 M 23 1.65 53
## 5 u010 F 23 1.60 55
## 6 u011 M 23 1.83 67
## 7 u017 M 23 1.78 73
## 8 u020 M 22 1.80 70
## 9 u018 F 21 1.56 54
## 10 u019 F 21 1.56 42
## 11 u003 F 20 1.65 51
## 12 u004 F 20 1.60 48
## 13 u012 M 20 1.90 70
## 14 u002 M 19 1.70 65
## 15 u006 F 19 1.50 39
## 16 u013 F 19 1.75 70
## 17 u001 M 18 1.80 80
## 18 u005 M 18 1.73 75
## 19 u014 M 18 1.65 63
## 20 u015 F 18 1.54 52
## Sort by gender first, then by age within each gender
student_BMI %>%
  arrange(umst_gender, age)
## umst_id umst_gender age heightInm weightInkg
## 1 u015 F 18 1.54 52
## 2 u006 F 19 1.50 39
## 3 u013 F 19 1.75 70
## 4 u003 F 20 1.65 51
## 5 u004 F 20 1.60 48
## 6 u018 F 21 1.56 54
## 7 u019 F 21 1.56 42
## 8 u010 F 23 1.60 55
## 9 u016 F 24 1.63 55
## 10 u009 F 25 1.55 40
## 11 u001 M 18 1.80 80
## 12 u005 M 18 1.73 75
## 13 u014 M 18 1.65 63
## 14 u002 M 19 1.70 65
## 15 u012 M 20 1.90 70
## 16 u020 M 22 1.80 70
## 17 u007 M 23 1.65 53
## 18 u011 M 23 1.83 67
## 19 u017 M 23 1.78 73
## 20 u008 M 24 1.78 73
##3. mutate()
## Add a BMI column for each student: weight (kg) divided by height (m) squared
BMI_result <- student_BMI %>%
  mutate(BMI = weightInkg / heightInm^2)
##4.select()
## Select the range of columns from heightInm through BMI
BMI_result %>%
  select(heightInm:BMI)
## heightInm weightInkg BMI
## 1 1.80 80 24.69136
## 2 1.70 65 22.49135
## 3 1.65 51 18.73278
## 4 1.60 48 18.75000
## 5 1.73 75 25.05931
## 6 1.50 39 17.33333
## 7 1.65 53 19.46740
## 8 1.78 73 23.04002
## 9 1.55 40 16.64932
## 10 1.60 55 21.48437
## 11 1.83 67 20.00657
## 12 1.90 70 19.39058
## 13 1.75 70 22.85714
## 14 1.65 63 23.14050
## 15 1.54 52 21.92613
## 16 1.63 55 20.70082
## 17 1.78 73 23.04002
## 18 1.56 54 22.18935
## 19 1.56 42 17.25838
## 20 1.80 70 21.60494
## Show every column except gender; a leading '-' excludes a column
BMI_result %>%
  select(-umst_gender)
## umst_id age heightInm weightInkg BMI
## 1 u001 18 1.80 80 24.69136
## 2 u002 19 1.70 65 22.49135
## 3 u003 20 1.65 51 18.73278
## 4 u004 20 1.60 48 18.75000
## 5 u005 18 1.73 75 25.05931
## 6 u006 19 1.50 39 17.33333
## 7 u007 23 1.65 53 19.46740
## 8 u008 24 1.78 73 23.04002
## 9 u009 25 1.55 40 16.64932
## 10 u010 23 1.60 55 21.48437
## 11 u011 23 1.83 67 20.00657
## 12 u012 20 1.90 70 19.39058
## 13 u013 19 1.75 70 22.85714
## 14 u014 18 1.65 63 23.14050
## 15 u015 18 1.54 52 21.92613
## 16 u016 24 1.63 55 20.70082
## 17 u017 23 1.78 73 23.04002
## 18 u018 21 1.56 54 22.18935
## 19 u019 21 1.56 42 17.25838
## 20 u020 22 1.80 70 21.60494
## Show only the BMI column
BMI_result %>%
  select(BMI)
## BMI
## 1 24.69136
## 2 22.49135
## 3 18.73278
## 4 18.75000
## 5 25.05931
## 6 17.33333
## 7 19.46740
## 8 23.04002
## 9 16.64932
## 10 21.48437
## 11 20.00657
## 12 19.39058
## 13 22.85714
## 14 23.14050
## 15 21.92613
## 16 20.70082
## 17 23.04002
## 18 22.18935
## 19 17.25838
## 20 21.60494
##5.summarise()
## Find the maximum BMI
BMI_result %>%
  summarise(max(BMI))
## max(BMI)
## 1 25.05931
## Find the minimum BMI
BMI_result %>%
  summarise(min(BMI))
## min(BMI)
## 1 16.64932
## Check whether any UM student has a BMI above 20 (strictly greater than 20)
BMI_result %>%
  summarise(any(BMI > 20))
## any(BMI > 20)
## 1 TRUE