library(tidyr) # Clean data
library(dplyr) # Manage data
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(rstatix) # Analyze data (descriptive/t-test)
##
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
##
## filter
library(rio) # Import data from Excel
library(ggplot2) # Create graphics
library(openxlsx) # Save result to Excel
library(agricolae) # Analyze data
library(lattice) # Advanced graphics
## Import data
data("USMortality")
## Show the data
USMortality
## Status Sex Cause Rate SE
## 1 Urban Male Heart disease 210.2 0.2
## 2 Rural Male Heart disease 242.7 0.6
## 3 Urban Female Heart disease 132.5 0.2
## 4 Rural Female Heart disease 154.9 0.4
## 53 Urban Male Cancer 195.9 0.2
## 54 Rural Male Cancer 219.3 0.5
## 55 Urban Female Cancer 140.2 0.2
## 56 Rural Female Cancer 150.8 0.4
## 105 Urban Male Lower respiratory 44.5 0.1
## 106 Rural Male Lower respiratory 62.8 0.3
## 107 Urban Female Lower respiratory 36.5 0.1
## 108 Rural Female Lower respiratory 46.9 0.2
## 157 Urban Male Unintentional injuries 49.6 0.1
## 158 Rural Male Unintentional injuries 71.3 0.3
## 159 Urban Female Unintentional injuries 24.7 0.1
## 160 Rural Female Unintentional injuries 37.2 0.2
## 209 Urban Male Cerebrovascular diseases 36.1 0.1
## 210 Rural Male Cerebrovascular diseases 42.2 0.2
## 211 Urban Female Cerebrovascular diseases 34.9 0.1
## 212 Rural Female Cerebrovascular diseases 42.2 0.2
## 261 Urban Male Alzheimers 19.4 0.1
## 262 Rural Male Alzheimers 21.8 0.2
## 263 Urban Female Alzheimers 25.5 0.1
## 264 Rural Female Alzheimers 30.6 0.2
## 313 Urban Male Diabetes 24.9 0.1
## 314 Rural Male Diabetes 29.5 0.2
## 315 Urban Female Diabetes 17.1 0.1
## 316 Rural Female Diabetes 21.8 0.2
## 365 Urban Male Flu and pneumonia 17.7 0.1
## 366 Rural Male Flu and pneumonia 20.8 0.2
## 367 Urban Female Flu and pneumonia 12.9 0.1
## 368 Rural Female Flu and pneumonia 16.3 0.1
## 417 Urban Male Suicide 19.2 0.1
## 418 Rural Male Suicide 26.3 0.2
## 419 Urban Female Suicide 5.3 0.0
## 420 Rural Female Suicide 6.2 0.1
## 469 Urban Male Nephritis 15.7 0.1
## 470 Rural Male Nephritis 18.3 0.2
## 471 Urban Female Nephritis 10.7 0.0
## 472 Rural Female Nephritis 13.9 0.1
df <- USMortality
summary(df) # summarize the data
## Status Sex Cause Rate
## Rural:20 Female:20 Alzheimers : 4 Min. : 5.30
## Urban:20 Male :20 Cancer : 4 1st Qu.: 18.98
## Cerebrovascular diseases: 4 Median : 30.05
## Diabetes : 4 Mean : 58.73
## Flu and pneumonia : 4 3rd Qu.: 52.90
## Heart disease : 4 Max. :242.70
## (Other) :16
## SE
## Min. :0.00
## 1st Qu.:0.10
## Median :0.20
## Mean :0.18
## 3rd Qu.:0.20
## Max. :0.60
##
dim(df) # count columns and rows
## [1] 40 5
str(df) # Check structure of the data
## 'data.frame': 40 obs. of 5 variables:
## $ Status: Factor w/ 2 levels "Rural","Urban": 2 1 2 1 2 1 2 1 2 1 ...
## $ Sex : Factor w/ 2 levels "Female","Male": 2 2 1 1 2 2 1 1 2 2 ...
## $ Cause : Factor w/ 10 levels "Alzheimers","Cancer",..: 6 6 6 6 2 2 2 2 7 7 ...
## $ Rate : num 210 243 132 155 196 ...
## $ SE : num 0.2 0.6 0.2 0.4 0.2 0.5 0.2 0.4 0.1 0.3 ...
colnames(df) # Check column names
## [1] "Status" "Sex" "Cause" "Rate" "SE"
head(df, 3) # Check the first 3 rows
## Status Sex Cause Rate SE
## 1 Urban Male Heart disease 210.2 0.2
## 2 Rural Male Heart disease 242.7 0.6
## 3 Urban Female Heart disease 132.5 0.2
## Select rows 1:3
df[1:3, ]
## Status Sex Cause Rate SE
## 1 Urban Male Heart disease 210.2 0.2
## 2 Rural Male Heart disease 242.7 0.6
## 3 Urban Female Heart disease 132.5 0.2
## Select rows 1,3,4,6
df[c(1,3,4,6), ]
## Status Sex Cause Rate SE
## 1 Urban Male Heart disease 210.2 0.2
## 3 Urban Female Heart disease 132.5 0.2
## 4 Rural Female Heart disease 154.9 0.4
## 54 Rural Male Cancer 219.3 0.5
## Select columns 1:2
df[, 1:2 ]
## Status Sex
## 1 Urban Male
## 2 Rural Male
## 3 Urban Female
## 4 Rural Female
## 53 Urban Male
## 54 Rural Male
## 55 Urban Female
## 56 Rural Female
## 105 Urban Male
## 106 Rural Male
## 107 Urban Female
## 108 Rural Female
## 157 Urban Male
## 158 Rural Male
## 159 Urban Female
## 160 Rural Female
## 209 Urban Male
## 210 Rural Male
## 211 Urban Female
## 212 Rural Female
## 261 Urban Male
## 262 Rural Male
## 263 Urban Female
## 264 Rural Female
## 313 Urban Male
## 314 Rural Male
## 315 Urban Female
## 316 Rural Female
## 365 Urban Male
## 366 Rural Male
## 367 Urban Female
## 368 Rural Female
## 417 Urban Male
## 418 Rural Male
## 419 Urban Female
## 420 Rural Female
## 469 Urban Male
## 470 Rural Male
## 471 Urban Female
## 472 Rural Female
## Select columns 1,3,4
df[, c(1,3,4)]
## Status Cause Rate
## 1 Urban Heart disease 210.2
## 2 Rural Heart disease 242.7
## 3 Urban Heart disease 132.5
## 4 Rural Heart disease 154.9
## 53 Urban Cancer 195.9
## 54 Rural Cancer 219.3
## 55 Urban Cancer 140.2
## 56 Rural Cancer 150.8
## 105 Urban Lower respiratory 44.5
## 106 Rural Lower respiratory 62.8
## 107 Urban Lower respiratory 36.5
## 108 Rural Lower respiratory 46.9
## 157 Urban Unintentional injuries 49.6
## 158 Rural Unintentional injuries 71.3
## 159 Urban Unintentional injuries 24.7
## 160 Rural Unintentional injuries 37.2
## 209 Urban Cerebrovascular diseases 36.1
## 210 Rural Cerebrovascular diseases 42.2
## 211 Urban Cerebrovascular diseases 34.9
## 212 Rural Cerebrovascular diseases 42.2
## 261 Urban Alzheimers 19.4
## 262 Rural Alzheimers 21.8
## 263 Urban Alzheimers 25.5
## 264 Rural Alzheimers 30.6
## 313 Urban Diabetes 24.9
## 314 Rural Diabetes 29.5
## 315 Urban Diabetes 17.1
## 316 Rural Diabetes 21.8
## 365 Urban Flu and pneumonia 17.7
## 366 Rural Flu and pneumonia 20.8
## 367 Urban Flu and pneumonia 12.9
## 368 Rural Flu and pneumonia 16.3
## 417 Urban Suicide 19.2
## 418 Rural Suicide 26.3
## 419 Urban Suicide 5.3
## 420 Rural Suicide 6.2
## 469 Urban Nephritis 15.7
## 470 Rural Nephritis 18.3
## 471 Urban Nephritis 10.7
## 472 Rural Nephritis 13.9
dim(df)
## [1] 40 5
No <- 1:40
No
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
df$No = No
df
## Status Sex Cause Rate SE No
## 1 Urban Male Heart disease 210.2 0.2 1
## 2 Rural Male Heart disease 242.7 0.6 2
## 3 Urban Female Heart disease 132.5 0.2 3
## 4 Rural Female Heart disease 154.9 0.4 4
## 53 Urban Male Cancer 195.9 0.2 5
## 54 Rural Male Cancer 219.3 0.5 6
## 55 Urban Female Cancer 140.2 0.2 7
## 56 Rural Female Cancer 150.8 0.4 8
## 105 Urban Male Lower respiratory 44.5 0.1 9
## 106 Rural Male Lower respiratory 62.8 0.3 10
## 107 Urban Female Lower respiratory 36.5 0.1 11
## 108 Rural Female Lower respiratory 46.9 0.2 12
## 157 Urban Male Unintentional injuries 49.6 0.1 13
## 158 Rural Male Unintentional injuries 71.3 0.3 14
## 159 Urban Female Unintentional injuries 24.7 0.1 15
## 160 Rural Female Unintentional injuries 37.2 0.2 16
## 209 Urban Male Cerebrovascular diseases 36.1 0.1 17
## 210 Rural Male Cerebrovascular diseases 42.2 0.2 18
## 211 Urban Female Cerebrovascular diseases 34.9 0.1 19
## 212 Rural Female Cerebrovascular diseases 42.2 0.2 20
## 261 Urban Male Alzheimers 19.4 0.1 21
## 262 Rural Male Alzheimers 21.8 0.2 22
## 263 Urban Female Alzheimers 25.5 0.1 23
## 264 Rural Female Alzheimers 30.6 0.2 24
## 313 Urban Male Diabetes 24.9 0.1 25
## 314 Rural Male Diabetes 29.5 0.2 26
## 315 Urban Female Diabetes 17.1 0.1 27
## 316 Rural Female Diabetes 21.8 0.2 28
## 365 Urban Male Flu and pneumonia 17.7 0.1 29
## 366 Rural Male Flu and pneumonia 20.8 0.2 30
## 367 Urban Female Flu and pneumonia 12.9 0.1 31
## 368 Rural Female Flu and pneumonia 16.3 0.1 32
## 417 Urban Male Suicide 19.2 0.1 33
## 418 Rural Male Suicide 26.3 0.2 34
## 419 Urban Female Suicide 5.3 0.0 35
## 420 Rural Female Suicide 6.2 0.1 36
## 469 Urban Male Nephritis 15.7 0.1 37
## 470 Rural Male Nephritis 18.3 0.2 38
## 471 Urban Female Nephritis 10.7 0.0 39
## 472 Rural Female Nephritis 13.9 0.1 40
df$Rate1 <- df$Rate/2
## Import existing data in r
data("midwest")
dim(midwest)
## [1] 437 28
## Show the data
midwest
## # A tibble: 437 × 28
## PID county state area poptotal popdensity popwhite popblack popamerindian
## <int> <chr> <chr> <dbl> <int> <dbl> <int> <int> <int>
## 1 561 ADAMS IL 0.052 66090 1271. 63917 1702 98
## 2 562 ALEXAN… IL 0.014 10626 759 7054 3496 19
## 3 563 BOND IL 0.022 14991 681. 14477 429 35
## 4 564 BOONE IL 0.017 30806 1812. 29344 127 46
## 5 565 BROWN IL 0.018 5836 324. 5264 547 14
## 6 566 BUREAU IL 0.05 35688 714. 35157 50 65
## 7 567 CALHOUN IL 0.017 5322 313. 5298 1 8
## 8 568 CARROLL IL 0.027 16805 622. 16519 111 30
## 9 569 CASS IL 0.024 13437 560. 13384 16 8
## 10 570 CHAMPA… IL 0.058 173025 2983. 146506 16559 331
## # ℹ 427 more rows
## # ℹ 19 more variables: popasian <int>, popother <int>, percwhite <dbl>,
## # percblack <dbl>, percamerindan <dbl>, percasian <dbl>, percother <dbl>,
## # popadults <int>, perchsd <dbl>, percollege <dbl>, percprof <dbl>,
## # poppovertyknown <int>, percpovertyknown <dbl>, percbelowpoverty <dbl>,
## # percchildbelowpovert <dbl>, percadultpoverty <dbl>,
## # percelderlypoverty <dbl>, inmetro <int>, category <chr>
## calculate frequency for column No. 3
midwest |> freq_table(3)
## # A tibble: 5 × 3
## state n prop
## <chr> <int> <dbl>
## 1 IL 102 23.3
## 2 IN 92 21.1
## 3 MI 83 19
## 4 OH 88 20.1
## 5 WI 72 16.5
summary(midwest$poptotal)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1701 18840 35324 96130 75651 5105067
data2 <- midwest |> filter(state %in% c("IL", "MI"))
dim(data2)
## [1] 185 28
data2 |> freq_table(3)
## # A tibble: 2 × 3
## state n prop
## <chr> <int> <dbl>
## 1 IL 102 55.1
## 2 MI 83 44.9
midwest |> group_by(state) |>
summarise(MEAN = mean(poptotal),
SD = sd(poptotal),
se = SD/sqrt(n()))
## # A tibble: 5 × 4
## state MEAN SD se
## <chr> <dbl> <dbl> <dbl>
## 1 IL 112065. 511067. 50603.
## 2 IN 60263. 102068. 10641.
## 3 MI 111992. 272662. 29929.
## 4 OH 123263. 213275. 22735.
## 5 WI 67941. 124129. 14629.
dff <- midwest |> filter(state %in% c("IN", "WI"))
dff$rat <- dff$popblack/dff$popwhite
result <- dff |> group_by(state) |>
summarise(MEAN = mean(rat*100),
SD = sd(rat*100),
se = SD/sqrt(n()))
## Show the result
result
## # A tibble: 2 × 4
## state MEAN SD se
## <chr> <dbl> <dbl> <dbl>
## 1 IN 2.17 5.10 0.532
## 2 WI 0.957 3.50 0.412
dff |> t_test(rat~state)
## # A tibble: 1 × 8
## .y. group1 group2 n1 n2 statistic df p
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 rat IN WI 92 72 1.81 159. 0.0725
write.xlsx(result, file="Mean for ratio.xlsx")
midwest[, c(3:13)] |> group_by(state) |>
get_summary_stats(show = c("mean", "sd", "min", "max"))
## # A tibble: 50 × 7
## state variable n mean sd min max
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 IL area 102 0.032 0.013 0.009 0.068
## 2 IL poptotal 102 112065. 511067. 4373 5105067
## 3 IL popdensity 102 2824. 9608. 199. 88018.
## 4 IL popwhite 102 87774. 326313. 4072 3204947
## 5 IL popblack 102 16611. 130412. 1 1317147
## 6 IL popamerindian 102 214. 1027. 6 10289
## 7 IL popasian 102 2797. 19045. 3 188565
## 8 IL popother 102 4669. 38079. 0 384119
## 9 IL percwhite 102 95.0 7.16 62.8 99.8
## 10 IL percblack 102 3.65 6.36 0.009 32.9
## # ℹ 40 more rows
Note that the echo = FALSE
parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.