Importing data into R is fairly simple. Examples of importing data are provided below.
From A Comma Delimited Text File
# Read the comma-delimited DIG demo file: the first row contains the variable
# names and the comma is the field separator.
# stringsAsFactors = TRUE keeps text columns as factors. The default became
# FALSE in R 4.0.0, and the factor-based steps that follow (levels(), the
# per-level counts printed by summary()) need factor columns.
mydata <- read.table(
  "C:\\Users\\Administrator\\Dropbox\\SHI\\Course Materials\\Mindy\\Basic Biostatistics\\Lecture 1\\DIG\\dig_demo.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)
From Excel
One of the best ways to read an Excel file is to export it to a comma delimited file and import it using the method above. Alternatively you can use the xlsx package to access Excel files. The first row should contain variable/column names.
There are a number of functions for listing the contents of an object or dataset.
# list the names of the objects in the current working environment
ls()
## [1] "mydata"
# list the variable (column) names in mydata
names(mydata)
## [1] "ID" "TRTMT" "AGE" "RACE" "SEX" "EJF_PER"
## [7] "EJFMETH" "CHESTX" "BMI" "FUNCTCLS" "CHFETIOL" "PREVMI"
## [13] "ANGINA" "DIABETES" "HYPERTEN" "DIGDOSE" "CVD" "CVDDAYS"
## [19] "WHF" "WHFDAYS" "DIG" "DIGDAYS" "MI" "MIDAYS"
## [25] "UANG" "UANGDAYS" "STRK" "STRKDAYS" "SVA" "SVADAYS"
## [31] "VENA" "VENADAYS" "CREV" "CREVDAYS" "OCVD" "OCVDDAYS"
## [37] "RINF" "RINFDAYS" "OTH" "OTHDAYS" "HOSP" "HOSPDAYS"
## [43] "NHOSP" "DEATH" "DEATHDAY" "REASON" "DWHF" "DWHFDAYS"
# show the structure of mydata: its dimensions, then each variable's type
# and first few values
str(mydata)
## 'data.frame': 6800 obs. of 48 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ TRTMT : Factor w/ 2 levels "Digoxin","Placebo": 2 2 2 1 2 2 1 1 2 1 ...
## $ AGE : int 66 77 72 57 74 69 64 60 74 64 ...
## $ RACE : Factor w/ 2 levels "Non-white","White": 2 2 2 2 2 1 2 1 1 2 ...
## $ SEX : Factor w/ 2 levels "Female","Male": 2 2 1 2 2 1 1 2 2 1 ...
## $ EJF_PER : int 40 12 36 31 15 45 30 39 33 24 ...
## $ EJFMETH : Factor w/ 3 levels "2-D Echo","Angiography",..: 2 3 3 3 3 3 3 3 1 3 ...
## $ CHESTX : num 0.5 0.56 0.68 0.48 0.53 0.7 0.52 0.4 0.49 0.52 ...
## $ BMI : num 20.1 20.7 25.5 25.8 25.7 ...
## $ FUNCTCLS: Factor w/ 5 levels "","I","II","III",..: 2 4 4 3 2 3 4 2 4 3 ...
## $ CHFETIOL: Factor w/ 7 levels "","Alcohol related",..: 5 4 5 3 4 4 5 5 4 5 ...
## $ PREVMI : int 0 1 0 0 0 1 1 0 0 0 ...
## $ ANGINA : int 1 1 1 0 0 0 0 0 1 0 ...
## $ DIABETES: int 1 0 0 0 0 0 0 0 0 0 ...
## $ HYPERTEN: int 0 1 1 1 0 0 0 1 1 1 ...
## $ DIGDOSE : num 0.25 0.25 0.25 0.25 0.375 0.25 0.25 0.25 0.125 0.25 ...
## $ CVD : int 1 1 1 0 1 1 0 0 0 1 ...
## $ CVDDAYS : int 1049 468 631 1157 191 496 903 1369 1747 149 ...
## $ WHF : int 1 1 1 0 1 0 0 0 0 0 ...
## $ WHFDAYS : int 1379 1329 631 1157 191 1620 903 1369 1747 1074 ...
## $ DIG : int 0 0 0 0 0 0 0 0 0 0 ...
## $ DIGDAYS : int 1438 1360 1391 1157 1550 1620 903 1369 1747 1074 ...
## $ MI : int 0 0 0 0 0 0 0 0 0 0 ...
## $ MIDAYS : int 1438 1360 1391 1157 1550 1620 903 1369 1747 1074 ...
## $ UANG : int 0 0 1 0 0 0 0 0 0 1 ...
## $ UANGDAYS: int 1438 1360 746 1157 1550 1620 903 1369 1747 149 ...
## $ STRK : int 0 1 0 0 0 0 0 0 0 0 ...
## $ STRKDAYS: int 1438 468 1391 1157 1550 1620 903 1369 1747 1074 ...
## $ SVA : int 0 0 0 0 0 0 0 0 0 0 ...
## $ SVADAYS : int 1438 1360 1391 1157 1550 1620 903 1369 1747 1074 ...
## $ VENA : int 0 0 0 0 0 1 0 0 0 0 ...
## $ VENADAYS: int 1438 1360 1391 1157 1550 496 903 1369 1747 1074 ...
## $ CREV : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CREVDAYS: int 1438 1360 1391 1157 1550 1620 903 1369 1747 1074 ...
## $ OCVD : int 1 0 0 0 0 0 0 0 0 0 ...
## $ OCVDDAYS: int 1049 1360 1391 1157 1550 1620 903 1369 1747 1074 ...
## $ RINF : int 0 0 0 0 0 0 0 0 0 0 ...
## $ RINFDAYS: int 1438 1360 1391 1157 1550 1620 903 1369 1747 1074 ...
## $ OTH : int 1 1 0 0 1 1 0 0 0 1 ...
## $ OTHDAYS : int 533 880 1391 1157 459 966 903 1369 1747 283 ...
## $ HOSP : int 1 1 1 0 1 1 0 0 0 1 ...
## $ HOSPDAYS: int 533 468 631 1157 191 496 903 1369 1747 149 ...
## $ NHOSP : int 6 4 2 0 5 5 0 0 0 2 ...
## $ DEATH : int 0 1 0 0 0 0 1 0 0 0 ...
## $ DEATHDAY: int 1438 1360 1391 1157 1550 1620 903 1369 1747 1074 ...
## $ REASON : Factor w/ 6 levels "","Non Cardiac Nonvascular",..: 1 6 1 1 1 1 3 1 1 1 ...
## $ DWHF : int 1 1 1 0 1 0 0 0 0 0 ...
## $ DWHFDAYS: int 1379 1329 631 1157 191 1620 903 1369 1747 1074 ...
# list the levels of the factor TRTMT in mydata
levels(mydata$TRTMT)
## [1] "Digoxin" "Placebo"
# class of an object (numeric, matrix, data frame, etc.)
class(mydata)
## [1] "data.frame"
# show the first 10 observations of mydata
head(x = mydata, n = 10L)
## ID TRTMT AGE RACE SEX EJF_PER EJFMETH CHESTX BMI
## 1 1 Placebo 66 White Male 40 Angiography 0.50 20.073
## 2 2 Placebo 77 White Male 12 Radionuclide 0.56 20.701
## 3 3 Placebo 72 White Female 36 Radionuclide 0.68 25.530
## 4 4 Digoxin 57 White Male 31 Radionuclide 0.48 25.794
## 5 5 Placebo 74 White Male 15 Radionuclide 0.53 25.654
## 6 6 Placebo 69 Non-white Female 45 Radionuclide 0.70 27.770
## 7 7 Digoxin 64 White Female 30 Radionuclide 0.52 31.694
## 8 8 Digoxin 60 Non-white Male 39 Radionuclide 0.40 25.110
## 9 9 Placebo 74 Non-white Male 33 2-D Echo 0.49 23.688
## 10 10 Digoxin 64 White Female 24 Radionuclide 0.52 28.697
## FUNCTCLS CHFETIOL PREVMI ANGINA DIABETES HYPERTEN DIGDOSE CVD
## 1 I Ischemic 0 1 1 0 0.250 1
## 2 III Idiopathic 1 1 0 1 0.250 1
## 3 III Ischemic 0 1 0 1 0.250 1
## 4 II Hypertensive 0 0 0 1 0.250 0
## 5 I Idiopathic 0 0 0 0 0.375 1
## 6 II Idiopathic 1 0 0 0 0.250 1
## 7 III Ischemic 1 0 0 0 0.250 0
## 8 I Ischemic 0 0 0 1 0.250 0
## 9 III Idiopathic 0 1 0 1 0.125 0
## 10 II Ischemic 0 0 0 1 0.250 1
## CVDDAYS WHF WHFDAYS DIG DIGDAYS MI MIDAYS UANG UANGDAYS STRK STRKDAYS
## 1 1049 1 1379 0 1438 0 1438 0 1438 0 1438
## 2 468 1 1329 0 1360 0 1360 0 1360 1 468
## 3 631 1 631 0 1391 0 1391 1 746 0 1391
## 4 1157 0 1157 0 1157 0 1157 0 1157 0 1157
## 5 191 1 191 0 1550 0 1550 0 1550 0 1550
## 6 496 0 1620 0 1620 0 1620 0 1620 0 1620
## 7 903 0 903 0 903 0 903 0 903 0 903
## 8 1369 0 1369 0 1369 0 1369 0 1369 0 1369
## 9 1747 0 1747 0 1747 0 1747 0 1747 0 1747
## 10 149 0 1074 0 1074 0 1074 1 149 0 1074
## SVA SVADAYS VENA VENADAYS CREV CREVDAYS OCVD OCVDDAYS RINF RINFDAYS OTH
## 1 0 1438 0 1438 0 1438 1 1049 0 1438 1
## 2 0 1360 0 1360 0 1360 0 1360 0 1360 1
## 3 0 1391 0 1391 0 1391 0 1391 0 1391 0
## 4 0 1157 0 1157 0 1157 0 1157 0 1157 0
## 5 0 1550 0 1550 0 1550 0 1550 0 1550 1
## 6 0 1620 1 496 0 1620 0 1620 0 1620 1
## 7 0 903 0 903 0 903 0 903 0 903 0
## 8 0 1369 0 1369 0 1369 0 1369 0 1369 0
## 9 0 1747 0 1747 0 1747 0 1747 0 1747 0
## 10 0 1074 0 1074 0 1074 0 1074 0 1074 1
## OTHDAYS HOSP HOSPDAYS NHOSP DEATH DEATHDAY REASON DWHF
## 1 533 1 533 6 0 1438 1
## 2 880 1 468 4 1 1360 Worsening Heart Failure 1
## 3 1391 1 631 2 0 1391 1
## 4 1157 0 1157 0 0 1157 0
## 5 459 1 191 5 0 1550 1
## 6 966 1 496 5 0 1620 0
## 7 903 0 903 0 1 903 Other Cardiac 0
## 8 1369 0 1369 0 0 1369 0
## 9 1747 0 1747 0 0 1747 0
## 10 283 1 149 2 0 1074 0
## DWHFDAYS
## 1 1379
## 2 1329
## 3 631
## 4 1157
## 5 191
## 6 1620
## 7 903
## 8 1369
## 9 1747
## 10 1074
# show the last 5 observations of mydata
tail(x = mydata, n = 5L)
## ID TRTMT AGE RACE SEX EJF_PER EJFMETH CHESTX BMI
## 6796 6796 Placebo 73 White Male 23 Radionuclide 0.58 28.975
## 6797 6797 Digoxin 39 White Male 15 Radionuclide 0.46 22.097
## 6798 6798 Digoxin 55 White Male 23 Radionuclide 0.62 26.221
## 6799 6799 Digoxin 56 White Male 29 Radionuclide 0.44 26.651
## 6800 6800 Placebo 70 White Male 45 2-D Echo 0.44 24.374
## FUNCTCLS CHFETIOL PREVMI ANGINA DIABETES HYPERTEN DIGDOSE CVD
## 6796 III Ischemic 0 0 0 1 0.250 1
## 6797 II Hypertensive 0 0 0 1 0.250 0
## 6798 II Idiopathic 0 0 1 1 0.125 1
## 6799 II Ischemic 1 0 0 1 0.250 1
## 6800 II Ischemic 1 0 1 0 0.125 0
## CVDDAYS WHF WHFDAYS DIG DIGDAYS MI MIDAYS UANG UANGDAYS STRK STRKDAYS
## 6796 505 0 546 0 546 1 505 0 546 0 546
## 6797 1419 0 1419 0 1419 0 1419 0 1419 0 1419
## 6798 549 1 549 0 582 0 582 0 582 0 582
## 6799 340 1 515 0 928 0 928 1 340 0 928
## 6800 1738 0 1738 0 1738 0 1738 0 1738 0 1738
## SVA SVADAYS VENA VENADAYS CREV CREVDAYS OCVD OCVDDAYS RINF RINFDAYS
## 6796 0 546 0 546 0 546 0 546 0 546
## 6797 0 1419 0 1419 0 1419 0 1419 0 1419
## 6798 0 582 0 582 0 582 0 582 0 582
## 6799 0 928 0 928 0 928 0 928 0 928
## 6800 0 1738 0 1738 0 1738 0 1738 0 1738
## OTH OTHDAYS HOSP HOSPDAYS NHOSP DEATH DEATHDAY
## 6796 0 546 1 505 1 1 546
## 6797 0 1419 0 1419 0 0 1419
## 6798 0 582 1 549 1 1 582
## 6799 1 572 1 340 6 1 928
## 6800 0 1738 0 1738 0 0 1738
## REASON DWHF DWHFDAYS
## 6796 Worsening Heart Failure 1 546
## 6797 0 1419
## 6798 Other Cardiac 1 549
## 6799 Worsening Heart Failure 1 515
## 6800 0 1738
R provides a wide range of functions for obtaining summary statistics. For example, summary( ) reports descriptive statistics for every variable in a data frame:
# per-variable summary: min, 1st quartile, median, mean, 3rd quartile and max
# for numeric variables; counts per level for factors; NA counts where present
summary(mydata)
## ID TRTMT AGE RACE
## Min. : 1 Digoxin:3397 Min. :21.00 Non-white: 991
## 1st Qu.:1701 Placebo:3403 1st Qu.:57.00 White :5809
## Median :3400 Median :65.00
## Mean :3400 Mean :63.48
## 3rd Qu.:5100 3rd Qu.:71.00
## Max. :6800 Max. :94.00
##
## SEX EJF_PER EJFMETH CHESTX
## Female:1519 Min. : 3.00 2-D Echo :2025 Min. :0.0500
## Male :5281 1st Qu.:22.00 Angiography : 384 1st Qu.:0.4900
## Median :29.00 Radionuclide:4391 Median :0.5300
## Mean :28.54 Mean :0.5336
## 3rd Qu.:35.00 3rd Qu.:0.5800
## Max. :45.00 Max. :0.9500
## NA's :2
## BMI FUNCTCLS CHFETIOL PREVMI
## Min. :14.45 : 6 : 18 Min. :0.0000
## 1st Qu.:23.68 I : 907 Alcohol related: 222 1st Qu.:0.0000
## Median :26.50 II :3664 Hypertensive : 583 Median :1.0000
## Mean :27.11 III:2081 Idiopathic :1007 Mean :0.6499
## 3rd Qu.:29.80 IV : 142 Ischemic :4803 3rd Qu.:1.0000
## Max. :62.66 Other : 68 Max. :1.0000
## NA's :1 Valvular : 99 NA's :1
## ANGINA DIABETES HYPERTEN DIGDOSE
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.2500
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.2500
## Mean :0.2679 Mean :0.2843 Mean :0.4536 Mean :0.2432
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.2500
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :0.5000
## NA's :2 NA's :1 NA's :1
## CVD CVDDAYS WHF WHFDAYS
## Min. :0.0000 Min. : 1.0 Min. :0.0000 Min. : 1.0
## 1st Qu.:0.0000 1st Qu.: 209.8 1st Qu.:0.0000 1st Qu.: 390.0
## Median :1.0000 Median : 708.5 Median :0.0000 Median : 990.5
## Mean :0.5212 Mean : 734.5 Mean :0.3074 Mean : 896.2
## 3rd Qu.:1.0000 3rd Qu.:1199.0 3rd Qu.:1.0000 3rd Qu.:1349.0
## Max. :1.0000 Max. :1781.0 Max. :1.0000 Max. :1781.0
##
## DIG DIGDAYS MI MIDAYS
## Min. :0.00000 Min. : 1 Min. :0.00000 Min. : 1.0
## 1st Qu.:0.00000 1st Qu.: 821 1st Qu.:0.00000 1st Qu.: 757.8
## Median :0.00000 Median :1147 Median :0.00000 Median :1131.0
## Mean :0.01441 Mean :1056 Mean :0.05824 Mean :1039.1
## 3rd Qu.:0.00000 3rd Qu.:1418 3rd Qu.:0.00000 3rd Qu.:1411.0
## Max. :1.00000 Max. :1781 Max. :1.00000 Max. :1781.0
##
## UANG UANGDAYS STRK STRKDAYS
## Min. :0.0000 Min. : 1.0 Min. :0.00000 Min. : 1
## 1st Qu.:0.0000 1st Qu.: 605.0 1st Qu.:0.00000 1st Qu.: 763
## Median :0.0000 Median :1068.0 Median :0.00000 Median :1130
## Mean :0.1172 Mean : 983.9 Mean :0.04721 Mean :1040
## 3rd Qu.:0.0000 3rd Qu.:1388.0 3rd Qu.:0.00000 3rd Qu.:1411
## Max. :1.0000 Max. :1781.0 Max. :1.00000 Max. :1781
##
## SVA SVADAYS VENA VENADAYS
## Min. :0.00000 Min. : 1.0 Min. :0.00000 Min. : 1.0
## 1st Qu.:0.00000 1st Qu.: 757.8 1st Qu.:0.00000 1st Qu.: 778.8
## Median :0.00000 Median :1129.0 Median :0.00000 Median :1132.0
## Mean :0.04191 Mean :1038.2 Mean :0.04221 Mean :1041.0
## 3rd Qu.:0.00000 3rd Qu.:1411.0 3rd Qu.:0.00000 3rd Qu.:1410.0
## Max. :1.00000 Max. :1781.0 Max. :1.00000 Max. :1781.0
##
## CREV CREVDAYS OCVD OCVDDAYS
## Min. :0.00000 Min. : 1 Min. :0.0000 Min. : 1.0
## 1st Qu.:0.00000 1st Qu.: 793 1st Qu.:0.0000 1st Qu.: 599.0
## Median :0.00000 Median :1137 Median :0.0000 Median :1069.0
## Mean :0.02265 Mean :1048 Mean :0.1257 Mean : 983.4
## 3rd Qu.:0.00000 3rd Qu.:1416 3rd Qu.:0.0000 3rd Qu.:1384.0
## Max. :1.00000 Max. :1781 Max. :1.0000 Max. :1781.0
##
## RINF RINFDAYS OTH OTHDAYS
## Min. :0.00000 Min. : 1.0 Min. :0.0000 Min. : 1
## 1st Qu.:0.00000 1st Qu.: 708.8 1st Qu.:0.0000 1st Qu.: 391
## Median :0.00000 Median :1115.0 Median :0.0000 Median : 929
## Mean :0.07206 Mean :1023.6 Mean :0.3274 Mean : 865
## 3rd Qu.:0.00000 3rd Qu.:1404.0 3rd Qu.:1.0000 3rd Qu.:1307
## Max. :1.00000 Max. :1781.0 Max. :1.0000 Max. :1770
##
## HOSP HOSPDAYS NHOSP DEATH
## Min. :0.0000 Min. : 1.0 Min. : 0.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.: 147.8 1st Qu.: 0.000 1st Qu.:0.0000
## Median :1.0000 Median : 496.0 Median : 1.000 Median :0.0000
## Mean :0.6568 Mean : 624.7 Mean : 1.931 Mean :0.3493
## 3rd Qu.:1.0000 3rd Qu.:1048.0 3rd Qu.: 3.000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1770.0 Max. :39.000 Max. :1.0000
##
## DEATHDAY REASON DWHF
## Min. : 1.0 :4425 Min. :0.0000
## 1st Qu.: 843.5 Non Cardiac Nonvascular: 355 1st Qu.:0.0000
## Median :1152.0 Other Cardiac : 952 Median :0.0000
## Mean :1063.7 Other Vascular : 95 Mean :0.3429
## 3rd Qu.:1423.0 Unknown : 130 3rd Qu.:1.0000
## Max. :1781.0 Worsening Heart Failure: 843 Max. :1.0000
##
## DWHFDAYS
## Min. : 1.0
## 1st Qu.: 390.0
## Median : 990.5
## Mean : 896.2
## 3rd Qu.:1349.0
## Max. :1781.0
##
Summary Statistics by Group
A simple way of generating summary statistics by a grouping variable is available in the psych package.
library(psych)
# Descriptive statistics of EJF_PER computed separately for each treatment
# group. describeBy() is the current name of this function; describe.by() is
# its deprecated predecessor.
describeBy(mydata$EJF_PER, group = mydata$TRTMT)
##
## Descriptive statistics by group
## group: Digoxin
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 3397 28.63 8.85 29 28.74 10.38 5 45 40 -0.09 -0.79
## se
## X1 0.15
## --------------------------------------------------------
## group: Placebo
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 3403 28.45 8.85 29 28.56 10.38 3 45 42 -0.09 -0.77
## se
## X1 0.15
Generating Frequency Tables
R provides many methods for creating frequency and contingency tables. You can generate frequency tables using the table( ) function, tables of proportions using the prop.table( ) function, and marginal frequencies using margin.table( ).
# 2-Way Frequency Table: treatment group (rows) by race (columns)
mytable <- table(mydata$TRTMT, mydata$RACE)
print(mytable)
##
## Non-white White
## Digoxin 487 2910
## Placebo 504 2899
# marginal frequencies over dimension 1 (row totals, one per treatment group)
margin.table(mytable, 1)
##
## Digoxin Placebo
## 3397 3403
# marginal frequencies over dimension 2 (column totals, one per race category)
margin.table(mytable, 2)
##
## Non-white White
## 991 5809
prop.table(mytable) # cell proportions (each cell divided by the grand total)
##
## Non-white White
## Digoxin 0.07161765 0.42794118
## Placebo 0.07411765 0.42632353
prop.table(mytable, 1) # row proportions (each row sums to 1)
##
## Non-white White
## Digoxin 0.1433618 0.8566382
## Placebo 0.1481046 0.8518954
prop.table(mytable, 2) # column proportions (each column sums to 1)
##
## Non-white White
## Digoxin 0.4914228 0.5009468
## Placebo 0.5085772 0.4990532
# 3-Way Frequency Table: treatment group x race x sex,
# printed as a flat table
mytable <- table(
  mydata$TRTMT,
  mydata$RACE,
  mydata$SEX
)
ftable(mytable)
## Female Male
##
## Digoxin Non-white 139 348
## White 616 2294
## Placebo Non-white 148 356
## White 616 2283
The ggplot2 package, created by Hadley Wickham, offers a powerful graphics language for creating elegant and complex plots. Its popularity in the R community has exploded in recent years. ggplot2 allows you to create graphs that represent both univariate and multivariate numerical and categorical data in a straightforward manner. Grouping can be represented by color, symbol, size, and transparency.
Density Chart
library(ggplot2)

# Basic density curve of EJF_PER, stored in p and then displayed
p <- ggplot(mydata, aes(x = EJF_PER)) +
  geom_density()
p

# Set the outline colour and the fill colour of the curve
ggplot(mydata, aes(x = EJF_PER)) +
  geom_density(color = "darkblue", fill = "lightblue")

# Draw the curve with a dashed line
ggplot(mydata, aes(x = EJF_PER)) +
  geom_density(linetype = "dashed")

# One curve per SEX group, distinguished by line colour
ggplot(mydata, aes(x = EJF_PER, color = SEX)) +
  geom_density()

# One curve per SEX group, distinguished by fill colour
ggplot(mydata, aes(x = EJF_PER, fill = SEX)) +
  geom_density()

# Same, with a semi-transparent fill so overlapping curves stay visible
ggplot(mydata, aes(x = EJF_PER, fill = SEX)) +
  geom_density(alpha = 0.4)
Combine histogram and density plots
# Histogram with density plot
# after_stat(density) puts the histogram bars on the density scale so they
# share a y-axis with the density curve. It replaces the deprecated
# ..density.. notation.
ggplot(mydata, aes(x = EJF_PER)) +
  geom_histogram(
    aes(y = after_stat(density)),
    colour = "black",
    fill = "white"
  ) +
  geom_density(alpha = .2, fill = "#FF6666")
# Color by groups
# position = "identity" overlays the per-group histograms instead of
# stacking them.
ggplot(mydata, aes(x = EJF_PER, color = SEX, fill = SEX)) +
  geom_histogram(
    aes(y = after_stat(density)),
    alpha = 0.5,
    position = "identity"
  ) +
  geom_density(alpha = .2)
Use facets
Split the plot in multiple panels:
# One panel per SEX level, stacked as rows
ggplot(mydata, aes(x = EJF_PER)) +
  geom_density() +
  facet_grid(SEX ~ .)
# One panel per SEX level, laid out as columns
ggplot(mydata, aes(x = EJF_PER)) +
  geom_density() +
  facet_grid(. ~ SEX)
# Grid of panels: RACE levels as rows, SEX levels as columns
ggplot(mydata, aes(x = EJF_PER)) +
  geom_density() +
  facet_grid(RACE ~ SEX)
1-sample t test
# Test whether the mean of EJF_PER differs from the hypothesised value 28
t.test(x = mydata$EJF_PER, mu = 28)
##
## One Sample t-test
##
## data: mydata$EJF_PER
## t = 5.0379, df = 6799, p-value = 4.828e-07
## alternative hypothesis: true mean is not equal to 28
## 95 percent confidence interval:
## 28.33024 28.75094
## sample estimates:
## mean of x
## 28.54059
2-sample t test
# Compare the mean of EJF_PER between the two SEX groups
t.test(EJF_PER ~ SEX, data = mydata)
##
## Welch Two Sample t-test
##
## data: EJF_PER by SEX
## t = 7.9669, df = 2428.3, p-value = 2.478e-15
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.553406 2.567783
## sample estimates:
## mean in group Female mean in group Male
## 30.14088 28.08029
Sample size calculation
library(pwr)
# Difference in means to detect and the assumed standard deviation
delta <- 2
sigma <- 8.85
# Standardised effect size passed to pwr.t.test() as d
effectsize <- delta / sigma
# n is left unspecified, so it is the quantity solved for
pwr.t.test(
  d = effectsize,
  sig.level = 0.05,
  power = 0.8,
  type = "two.sample"
)
##
## Two-sample t test power calculation
##
## n = 308.3345
## d = 0.2259887
## sig.level = 0.05
## power = 0.8
## alternative = two.sided
##
## NOTE: n is number in *each* group
Power calculation
# Difference in means to detect and the assumed standard deviation
delta <- 2
sigma <- 8.85
# Standardised effect size passed to pwr.t.test() as d
effectsize <- delta / sigma
# power is left unspecified, so it is the quantity solved for (n = 400 per group)
pwr.t.test(
  n = 400,
  d = effectsize,
  sig.level = 0.05,
  type = "two.sample"
)
##
## Two-sample t test power calculation
##
## n = 400
## d = 0.2259887
## sig.level = 0.05
## power = 0.8910537
## alternative = two.sided
##
## NOTE: n is number in *each* group
# Plot the normal density with mean 0 and sd 1 (dnorm) over the x-range
# [-3, 3], evaluated at 100 points
ggplot(data = data.frame(x = c(-3, 3)), aes(x)) +
stat_function(fun = dnorm, n = 100, args = list(mean = 0, sd = 1))