Descriptive statistics

Import library

library(tidyr) # Clean data
library(dplyr) # Manage data

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(rstatix) # Analyze data (descriptive/t-test)

## 
## Attaching package: 'rstatix'

## The following object is masked from 'package:stats':
## 
##     filter

library(rio) # Import data from Excel
library(ggplot2) # Create graphics
library(openxlsx) # Save result to Excel
library(agricolae) # Analyze data
library(lattice) # Advanced graphics

Import data

In R, there are exisitng data, we just type data(); then data will pop up and we can use it.

## Import data 
data("USMortality")

## Show the data
USMortality

##     Status    Sex                    Cause  Rate  SE
## 1    Urban   Male            Heart disease 210.2 0.2
## 2    Rural   Male            Heart disease 242.7 0.6
## 3    Urban Female            Heart disease 132.5 0.2
## 4    Rural Female            Heart disease 154.9 0.4
## 53   Urban   Male                   Cancer 195.9 0.2
## 54   Rural   Male                   Cancer 219.3 0.5
## 55   Urban Female                   Cancer 140.2 0.2
## 56   Rural Female                   Cancer 150.8 0.4
## 105  Urban   Male        Lower respiratory  44.5 0.1
## 106  Rural   Male        Lower respiratory  62.8 0.3
## 107  Urban Female        Lower respiratory  36.5 0.1
## 108  Rural Female        Lower respiratory  46.9 0.2
## 157  Urban   Male   Unintentional injuries  49.6 0.1
## 158  Rural   Male   Unintentional injuries  71.3 0.3
## 159  Urban Female   Unintentional injuries  24.7 0.1
## 160  Rural Female   Unintentional injuries  37.2 0.2
## 209  Urban   Male Cerebrovascular diseases  36.1 0.1
## 210  Rural   Male Cerebrovascular diseases  42.2 0.2
## 211  Urban Female Cerebrovascular diseases  34.9 0.1
## 212  Rural Female Cerebrovascular diseases  42.2 0.2
## 261  Urban   Male               Alzheimers  19.4 0.1
## 262  Rural   Male               Alzheimers  21.8 0.2
## 263  Urban Female               Alzheimers  25.5 0.1
## 264  Rural Female               Alzheimers  30.6 0.2
## 313  Urban   Male                 Diabetes  24.9 0.1
## 314  Rural   Male                 Diabetes  29.5 0.2
## 315  Urban Female                 Diabetes  17.1 0.1
## 316  Rural Female                 Diabetes  21.8 0.2
## 365  Urban   Male        Flu and pneumonia  17.7 0.1
## 366  Rural   Male        Flu and pneumonia  20.8 0.2
## 367  Urban Female        Flu and pneumonia  12.9 0.1
## 368  Rural Female        Flu and pneumonia  16.3 0.1
## 417  Urban   Male                  Suicide  19.2 0.1
## 418  Rural   Male                  Suicide  26.3 0.2
## 419  Urban Female                  Suicide   5.3 0.0
## 420  Rural Female                  Suicide   6.2 0.1
## 469  Urban   Male                Nephritis  15.7 0.1
## 470  Rural   Male                Nephritis  18.3 0.2
## 471  Urban Female                Nephritis  10.7 0.0
## 472  Rural Female                Nephritis  13.9 0.1

Rename the data

df <- USMortality

Check the data

summary(df) # summarize the data

##    Status       Sex                          Cause         Rate       
##  Rural:20   Female:20   Alzheimers              : 4   Min.   :  5.30  
##  Urban:20   Male  :20   Cancer                  : 4   1st Qu.: 18.98  
##                         Cerebrovascular diseases: 4   Median : 30.05  
##                         Diabetes                : 4   Mean   : 58.73  
##                         Flu and pneumonia       : 4   3rd Qu.: 52.90  
##                         Heart disease           : 4   Max.   :242.70  
##                         (Other)                 :16                   
##        SE      
##  Min.   :0.00  
##  1st Qu.:0.10  
##  Median :0.20  
##  Mean   :0.18  
##  3rd Qu.:0.20  
##  Max.   :0.60  
##

dim(df) # count columns and rows

## [1] 40  5

str(df) # Check structure of the data

## 'data.frame':    40 obs. of  5 variables:
##  $ Status: Factor w/ 2 levels "Rural","Urban": 2 1 2 1 2 1 2 1 2 1 ...
##  $ Sex   : Factor w/ 2 levels "Female","Male": 2 2 1 1 2 2 1 1 2 2 ...
##  $ Cause : Factor w/ 10 levels "Alzheimers","Cancer",..: 6 6 6 6 2 2 2 2 7 7 ...
##  $ Rate  : num  210 243 132 155 196 ...
##  $ SE    : num  0.2 0.6 0.2 0.4 0.2 0.5 0.2 0.4 0.1 0.3 ...

colnames(df) # Check column names

## [1] "Status" "Sex"    "Cause"  "Rate"   "SE"

head(df, 3) # Check the first 3 rows

##   Status    Sex         Cause  Rate  SE
## 1  Urban   Male Heart disease 210.2 0.2
## 2  Rural   Male Heart disease 242.7 0.6
## 3  Urban Female Heart disease 132.5 0.2

select specific rows and columns

## Select rows 1:3
df[1:3, ]

##   Status    Sex         Cause  Rate  SE
## 1  Urban   Male Heart disease 210.2 0.2
## 2  Rural   Male Heart disease 242.7 0.6
## 3  Urban Female Heart disease 132.5 0.2

## Select rows 1,3,4,6
df[c(1,3,4,6), ]

##    Status    Sex         Cause  Rate  SE
## 1   Urban   Male Heart disease 210.2 0.2
## 3   Urban Female Heart disease 132.5 0.2
## 4   Rural Female Heart disease 154.9 0.4
## 54  Rural   Male        Cancer 219.3 0.5

## Select columns 1:2
df[, 1:2 ]

##     Status    Sex
## 1    Urban   Male
## 2    Rural   Male
## 3    Urban Female
## 4    Rural Female
## 53   Urban   Male
## 54   Rural   Male
## 55   Urban Female
## 56   Rural Female
## 105  Urban   Male
## 106  Rural   Male
## 107  Urban Female
## 108  Rural Female
## 157  Urban   Male
## 158  Rural   Male
## 159  Urban Female
## 160  Rural Female
## 209  Urban   Male
## 210  Rural   Male
## 211  Urban Female
## 212  Rural Female
## 261  Urban   Male
## 262  Rural   Male
## 263  Urban Female
## 264  Rural Female
## 313  Urban   Male
## 314  Rural   Male
## 315  Urban Female
## 316  Rural Female
## 365  Urban   Male
## 366  Rural   Male
## 367  Urban Female
## 368  Rural Female
## 417  Urban   Male
## 418  Rural   Male
## 419  Urban Female
## 420  Rural Female
## 469  Urban   Male
## 470  Rural   Male
## 471  Urban Female
## 472  Rural Female

## Select columns 1,3,4
df[, c(1,3,4)]

##     Status                    Cause  Rate
## 1    Urban            Heart disease 210.2
## 2    Rural            Heart disease 242.7
## 3    Urban            Heart disease 132.5
## 4    Rural            Heart disease 154.9
## 53   Urban                   Cancer 195.9
## 54   Rural                   Cancer 219.3
## 55   Urban                   Cancer 140.2
## 56   Rural                   Cancer 150.8
## 105  Urban        Lower respiratory  44.5
## 106  Rural        Lower respiratory  62.8
## 107  Urban        Lower respiratory  36.5
## 108  Rural        Lower respiratory  46.9
## 157  Urban   Unintentional injuries  49.6
## 158  Rural   Unintentional injuries  71.3
## 159  Urban   Unintentional injuries  24.7
## 160  Rural   Unintentional injuries  37.2
## 209  Urban Cerebrovascular diseases  36.1
## 210  Rural Cerebrovascular diseases  42.2
## 211  Urban Cerebrovascular diseases  34.9
## 212  Rural Cerebrovascular diseases  42.2
## 261  Urban               Alzheimers  19.4
## 262  Rural               Alzheimers  21.8
## 263  Urban               Alzheimers  25.5
## 264  Rural               Alzheimers  30.6
## 313  Urban                 Diabetes  24.9
## 314  Rural                 Diabetes  29.5
## 315  Urban                 Diabetes  17.1
## 316  Rural                 Diabetes  21.8
## 365  Urban        Flu and pneumonia  17.7
## 366  Rural        Flu and pneumonia  20.8
## 367  Urban        Flu and pneumonia  12.9
## 368  Rural        Flu and pneumonia  16.3
## 417  Urban                  Suicide  19.2
## 418  Rural                  Suicide  26.3
## 419  Urban                  Suicide   5.3
## 420  Rural                  Suicide   6.2
## 469  Urban                Nephritis  15.7
## 470  Rural                Nephritis  18.3
## 471  Urban                Nephritis  10.7
## 472  Rural                Nephritis  13.9

Add a new number

dim(df)

## [1] 40  5

No <- 1:40
No

##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40

df$No = No
df

##     Status    Sex                    Cause  Rate  SE No
## 1    Urban   Male            Heart disease 210.2 0.2  1
## 2    Rural   Male            Heart disease 242.7 0.6  2
## 3    Urban Female            Heart disease 132.5 0.2  3
## 4    Rural Female            Heart disease 154.9 0.4  4
## 53   Urban   Male                   Cancer 195.9 0.2  5
## 54   Rural   Male                   Cancer 219.3 0.5  6
## 55   Urban Female                   Cancer 140.2 0.2  7
## 56   Rural Female                   Cancer 150.8 0.4  8
## 105  Urban   Male        Lower respiratory  44.5 0.1  9
## 106  Rural   Male        Lower respiratory  62.8 0.3 10
## 107  Urban Female        Lower respiratory  36.5 0.1 11
## 108  Rural Female        Lower respiratory  46.9 0.2 12
## 157  Urban   Male   Unintentional injuries  49.6 0.1 13
## 158  Rural   Male   Unintentional injuries  71.3 0.3 14
## 159  Urban Female   Unintentional injuries  24.7 0.1 15
## 160  Rural Female   Unintentional injuries  37.2 0.2 16
## 209  Urban   Male Cerebrovascular diseases  36.1 0.1 17
## 210  Rural   Male Cerebrovascular diseases  42.2 0.2 18
## 211  Urban Female Cerebrovascular diseases  34.9 0.1 19
## 212  Rural Female Cerebrovascular diseases  42.2 0.2 20
## 261  Urban   Male               Alzheimers  19.4 0.1 21
## 262  Rural   Male               Alzheimers  21.8 0.2 22
## 263  Urban Female               Alzheimers  25.5 0.1 23
## 264  Rural Female               Alzheimers  30.6 0.2 24
## 313  Urban   Male                 Diabetes  24.9 0.1 25
## 314  Rural   Male                 Diabetes  29.5 0.2 26
## 315  Urban Female                 Diabetes  17.1 0.1 27
## 316  Rural Female                 Diabetes  21.8 0.2 28
## 365  Urban   Male        Flu and pneumonia  17.7 0.1 29
## 366  Rural   Male        Flu and pneumonia  20.8 0.2 30
## 367  Urban Female        Flu and pneumonia  12.9 0.1 31
## 368  Rural Female        Flu and pneumonia  16.3 0.1 32
## 417  Urban   Male                  Suicide  19.2 0.1 33
## 418  Rural   Male                  Suicide  26.3 0.2 34
## 419  Urban Female                  Suicide   5.3 0.0 35
## 420  Rural Female                  Suicide   6.2 0.1 36
## 469  Urban   Male                Nephritis  15.7 0.1 37
## 470  Rural   Male                Nephritis  18.3 0.2 38
## 471  Urban Female                Nephritis  10.7 0.0 39
## 472  Rural Female                Nephritis  13.9 0.1 40

Add a new column (Rate/2)

df$Rate1 <- df$Rate/2

Descriptive statistics

## Import existing data in r
data("midwest")

dim(midwest)

## [1] 437  28

## Show the data
midwest

## # A tibble: 437 × 28
##      PID county  state  area poptotal popdensity popwhite popblack popamerindian
##    <int> <chr>   <chr> <dbl>    <int>      <dbl>    <int>    <int>         <int>
##  1   561 ADAMS   IL    0.052    66090      1271.    63917     1702            98
##  2   562 ALEXAN… IL    0.014    10626       759      7054     3496            19
##  3   563 BOND    IL    0.022    14991       681.    14477      429            35
##  4   564 BOONE   IL    0.017    30806      1812.    29344      127            46
##  5   565 BROWN   IL    0.018     5836       324.     5264      547            14
##  6   566 BUREAU  IL    0.05     35688       714.    35157       50            65
##  7   567 CALHOUN IL    0.017     5322       313.     5298        1             8
##  8   568 CARROLL IL    0.027    16805       622.    16519      111            30
##  9   569 CASS    IL    0.024    13437       560.    13384       16             8
## 10   570 CHAMPA… IL    0.058   173025      2983.   146506    16559           331
## # ℹ 427 more rows
## # ℹ 19 more variables: popasian <int>, popother <int>, percwhite <dbl>,
## #   percblack <dbl>, percamerindan <dbl>, percasian <dbl>, percother <dbl>,
## #   popadults <int>, perchsd <dbl>, percollege <dbl>, percprof <dbl>,
## #   poppovertyknown <int>, percpovertyknown <dbl>, percbelowpoverty <dbl>,
## #   percchildbelowpovert <dbl>, percadultpoverty <dbl>,
## #   percelderlypoverty <dbl>, inmetro <int>, category <chr>

Calculate frequency and percentage

## calculate frequency for column No. 3
midwest |> freq_table(3)

## # A tibble: 5 × 3
##   state     n  prop
##   <chr> <int> <dbl>
## 1 IL      102  23.3
## 2 IN       92  21.1
## 3 MI       83  19  
## 4 OH       88  20.1
## 5 WI       72  16.5

summarize poptotal

summary(midwest$poptotal)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1701   18840   35324   96130   75651 5105067

Select only IL and MI

data2 <- midwest |> filter(state %in% c("IL", "MI"))
dim(data2)

## [1] 185  28

data2 |> freq_table(3)

## # A tibble: 2 × 3
##   state     n  prop
##   <chr> <int> <dbl>
## 1 IL      102  55.1
## 2 MI       83  44.9

Calculate mean, sd, se by state

midwest |> group_by(state) |>
  summarise(MEAN = mean(poptotal),
            SD = sd(poptotal),
            se = SD/sqrt(n()))

## # A tibble: 5 × 4
##   state    MEAN      SD     se
##   <chr>   <dbl>   <dbl>  <dbl>
## 1 IL    112065. 511067. 50603.
## 2 IN     60263. 102068. 10641.
## 3 MI    111992. 272662. 29929.
## 4 OH    123263. 213275. 22735.
## 5 WI     67941. 124129. 14629.

Calculate mean for white/black ratio for state IN&WI

dff <- midwest |> filter(state %in% c("IN", "WI"))

dff$rat <- dff$popblack/dff$popwhite

result <- dff |> group_by(state) |>
  summarise(MEAN = mean(rat*100),
            SD = sd(rat*100),
            se = SD/sqrt(n()))

## Show the result
result

## # A tibble: 2 × 4
##   state  MEAN    SD    se
##   <chr> <dbl> <dbl> <dbl>
## 1 IN    2.17   5.10 0.532
## 2 WI    0.957  3.50 0.412

t-test and save result to Excel

dff |> t_test(rat~state)

## # A tibble: 1 × 8
##   .y.   group1 group2    n1    n2 statistic    df      p
## * <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl>  <dbl>
## 1 rat   IN     WI        92    72      1.81  159. 0.0725

Save the result into MS Excel

write.xlsx(result, file="Mean for ratio.xlsx")

Calculate means for columns 4:15

midwest[, c(3:13)] |> group_by(state) |>
  get_summary_stats(show = c("mean", "sd", "min", "max"))

## # A tibble: 50 × 7
##    state variable          n       mean         sd      min         max
##    <chr> <fct>         <dbl>      <dbl>      <dbl>    <dbl>       <dbl>
##  1 IL    area            102      0.032      0.013    0.009       0.068
##  2 IL    poptotal        102 112065.    511067.    4373     5105067    
##  3 IL    popdensity      102   2824.      9608.     199.      88018.   
##  4 IL    popwhite        102  87774.    326313.    4072     3204947    
##  5 IL    popblack        102  16611.    130412.       1     1317147    
##  6 IL    popamerindian   102    214.      1027.       6       10289    
##  7 IL    popasian        102   2797.     19045.       3      188565    
##  8 IL    popother        102   4669.     38079.       0      384119    
##  9 IL    percwhite       102     95.0        7.16    62.8        99.8  
## 10 IL    percblack       102      3.65       6.36     0.009      32.9  
## # ℹ 40 more rows

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Descriptive statistics

Lyhour Hin

2024-12-31

Import library

Import data

In R, there are exisitng data, we just type data(); then data will pop up and we can use it.

Rename the data

Check the data

select specific rows and columns

Add a new number

Add a new column (Rate/2)

Descriptive statistics

Calculate frequency and percentage

summarize poptotal

Select only IL and MI

Calculate mean, sd, se by state

Calculate mean for white/black ratio for state IN&WI

t-test and save result to Excel

Save the result into MS Excel

Calculate means for columns 4:15