I am going to do data manipulation to bring out meaningfull insights and visualizations,by use of following data:
1.0 gss_cat
2.0 cars data
3.0 possum data
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(forcats)
library(patchwork)
data(gss_cat)
names(gss_cat)<-str_to_title(names(gss_cat))
gss<-gss_cat
head(gss)
## # A tibble: 6 × 9
## Year Marital Age Race Rincome Partyid Relig Denom Tvhours
## <int> <fct> <int> <fct> <fct> <fct> <fct> <fct> <int>
## 1 2000 Never married 26 White $8000 to 9999 Ind,near r… Prot… Sout… 12
## 2 2000 Divorced 48 White $8000 to 9999 Not str re… Prot… Bapt… NA
## 3 2000 Widowed 67 White Not applicable Independent Prot… No d… 2
## 4 2000 Never married 39 White Not applicable Ind,near r… Orth… Not … 4
## 5 2000 Divorced 25 White Not applicable Not str de… None Not … 1
## 6 2000 Married 25 White $20000 - 24999 Strong dem… Prot… Sout… NA
glimpse(gss)
## Rows: 21,483
## Columns: 9
## $ Year <int> 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 20…
## $ Marital <fct> Never married, Divorced, Widowed, Never married, Divorced, Mar…
## $ Age <int> 26, 48, 67, 39, 25, 25, 36, 44, 44, 47, 53, 52, 52, 51, 52, 40…
## $ Race <fct> White, White, White, White, White, White, White, White, White,…
## $ Rincome <fct> $8000 to 9999, $8000 to 9999, Not applicable, Not applicable, …
## $ Partyid <fct> "Ind,near rep", "Not str republican", "Independent", "Ind,near…
## $ Relig <fct> Protestant, Protestant, Protestant, Orthodox-christian, None, …
## $ Denom <fct> "Southern baptist", "Baptist-dk which", "No denomination", "No…
## $ Tvhours <int> 12, NA, 2, 4, 1, NA, 3, NA, 0, 3, 2, NA, 1, NA, 1, 7, NA, 3, 3…
summary(gss)
## Year Marital Age Race
## Min. :2000 No answer : 17 Min. :18.00 Other : 1959
## 1st Qu.:2002 Never married: 5416 1st Qu.:33.00 Black : 3129
## Median :2006 Separated : 743 Median :46.00 White :16395
## Mean :2007 Divorced : 3383 Mean :47.18 Not applicable: 0
## 3rd Qu.:2010 Widowed : 1807 3rd Qu.:59.00
## Max. :2014 Married :10117 Max. :89.00
## NA's :76
## Rincome Partyid Relig
## $25000 or more:7363 Independent :4119 Protestant:10846
## Not applicable:7043 Not str democrat :3690 Catholic : 5124
## $20000 - 24999:1283 Strong democrat :3490 None : 3523
## $10000 - 14999:1168 Not str republican:3032 Christian : 689
## $15000 - 19999:1048 Ind,near dem :2499 Jewish : 388
## Refused : 975 Strong republican :2314 Other : 224
## (Other) :2603 (Other) :2339 (Other) : 689
## Denom Tvhours
## Not applicable :10072 Min. : 0.000
## Other : 2534 1st Qu.: 1.000
## No denomination : 1683 Median : 2.000
## Southern baptist: 1536 Mean : 2.981
## Baptist-dk which: 1457 3rd Qu.: 4.000
## United methodist: 1067 Max. :24.000
## (Other) : 3134 NA's :10146
names(gss)
## [1] "Year" "Marital" "Age" "Race" "Rincome" "Partyid" "Relig"
## [8] "Denom" "Tvhours"
dim(gss)
## [1] 21483 9
data wrangling
gss%>%pull(Race)%>%unique()
## [1] White Black Other
## Levels: Other Black White Not applicable
class(gss$Race)
## [1] "factor"
count(gss,Race,sort = T)
## # A tibble: 3 × 2
## Race n
## <fct> <int>
## 1 White 16395
## 2 Black 3129
## 3 Other 1959
gss<-gss%>%mutate(Race=fct_drop(Race))
levels(gss$Race)
## [1] "Other" "Black" "White"
gss%>%select(Race)%>%table()%>%barplot()
dim(gss)
## [1] 21483 9
fct_reorder fct_rev
fct_infreq
fct_recode
levels(gss$Partyid)
## [1] "No answer" "Don't know" "Other party"
## [4] "Strong republican" "Not str republican" "Ind,near rep"
## [7] "Independent" "Ind,near dem" "Not str democrat"
## [10] "Strong democrat"
gss%>%mutate(Partyid=fct_recode(Partyid,"No"="No answer","S.r"="Strong republican")) %>%count(Partyid)
## # A tibble: 10 × 2
## Partyid n
## <fct> <int>
## 1 No 154
## 2 Don't know 1
## 3 Other party 393
## 4 S.r 2314
## 5 Not str republican 3032
## 6 Ind,near rep 1791
## 7 Independent 4119
## 8 Ind,near dem 2499
## 9 Not str democrat 3690
## 10 Strong democrat 3490
fct_collapse
gss%>%mutate(Partyid=fct_collapse(Partyid,Others=c("Don't know","No answer","Other party"),Republicans=c("Not str republican","Strong republican"),Independent=c("Ind,near rep","Independent","Ind,near dem"),Democrats=c("Not str democrat","Strong democrat")))%>%count(Partyid)
## # A tibble: 4 × 2
## Partyid n
## <fct> <int>
## 1 Others 548
## 2 Republicans 5346
## 3 Independent 8409
## 4 Democrats 7180
factor lump
gss%>%count(Relig)
## # A tibble: 15 × 2
## Relig n
## <fct> <int>
## 1 No answer 93
## 2 Don't know 15
## 3 Inter-nondenominational 109
## 4 Native american 23
## 5 Christian 689
## 6 Orthodox-christian 95
## 7 Moslem/islam 104
## 8 Other eastern 32
## 9 Hinduism 71
## 10 Buddhism 147
## 11 Other 224
## 12 None 3523
## 13 Jewish 388
## 14 Catholic 5124
## 15 Protestant 10846
gss%>%mutate(Relig=fct_lump(Relig,4))%>%count(Relig)
## # A tibble: 5 × 2
## Relig n
## <fct> <int>
## 1 Christian 689
## 2 None 3523
## 3 Catholic 5124
## 4 Protestant 10846
## 5 Other 1301
reorder
## [1] "No answer" "Never married" "Separated" "Divorced"
## [5] "Widowed" "Married"
## NULL
p<-function(gss){sum(is.na(gss))/length(gss)*100}#missing value percentages
apply(gss,2,p)
## Year Marital Age Race Rincome Partyid Relig
## 0.0000000 0.0000000 0.3537681 0.0000000 0.0000000 0.0000000 0.0000000
## Denom Tvhours
## 0.0000000 47.2280408
#boxplot(gss$Age[gss$Marital=="Widowed"])
2.0 cars data
## 'data.frame': 50 obs. of 2 variables:
## $ speed: num 4 4 7 7 8 9 10 10 10 11 ...
## $ dist : num 2 10 4 22 16 10 18 26 34 17 ...
add column
summary(cars)
## speed dist c time
## Min. : 4.0 Min. : 2.00 high :27 Min. :0.500
## 1st Qu.:12.0 1st Qu.: 26.00 low :15 1st Qu.:1.921
## Median :15.0 Median : 36.00 medium: 8 Median :2.523
## Mean :15.4 Mean : 42.98 Mean :2.632
## 3rd Qu.:19.0 3rd Qu.: 56.00 3rd Qu.:3.186
## Max. :25.0 Max. :120.00 Max. :5.714
lm<-lm(speed~dist,data=cars)
summary(lm)
##
## Call:
## lm(formula = speed ~ dist, data = cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.5293 -2.1550 0.3615 2.4377 6.4179
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.28391 0.87438 9.474 1.44e-12 ***
## dist 0.16557 0.01749 9.464 1.49e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.156 on 48 degrees of freedom
## Multiple R-squared: 0.6511, Adjusted R-squared: 0.6438
## F-statistic: 89.57 on 1 and 48 DF, p-value: 1.49e-12
groupby and summarise
p<-cars%>%group_by(c)%>%count()
p
## # A tibble: 3 × 2
## # Groups: c [3]
## c n
## <fct> <int>
## 1 high 27
## 2 low 15
## 3 medium 8
plot
cars%>%summarise_at(.vars=vars(speed,dist),.funs = mean)
## speed dist
## 1 15.4 42.98
2.0 possum data
library(DAAG)
library(stringr)
data(possum)
names(possum)<-str_to_title(names(possum))
possum$Site<-as.factor(possum$Site)
glimpse(possum)
## Rows: 104
## Columns: 14
## $ Case <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ Site <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Pop <fct> Vic, Vic, Vic, Vic, Vic, Vic, Vic, Vic, Vic, Vic, Vic, Vic, V…
## $ Sex <fct> m, f, f, f, f, f, m, f, f, f, f, f, m, m, m, m, f, m, f, f, f…
## $ Age <dbl> 8, 6, 6, 6, 2, 1, 2, 6, 9, 6, 9, 5, 5, 3, 5, 4, 1, 2, 5, 4, 3…
## $ Hdlngth <dbl> 94.1, 92.5, 94.0, 93.2, 91.5, 93.1, 95.3, 94.8, 93.4, 91.8, 9…
## $ Skullw <dbl> 60.4, 57.6, 60.0, 57.1, 56.3, 54.8, 58.2, 57.6, 56.3, 58.0, 5…
## $ Totlngth <dbl> 89.0, 91.5, 95.5, 92.0, 85.5, 90.5, 89.5, 91.0, 91.5, 89.5, 8…
## $ Taill <dbl> 36.0, 36.5, 39.0, 38.0, 36.0, 35.5, 36.0, 37.0, 37.0, 37.5, 3…
## $ Footlgth <dbl> 74.5, 72.5, 75.4, 76.1, 71.0, 73.2, 71.5, 72.7, 72.4, 70.9, 7…
## $ Earconch <dbl> 54.5, 51.2, 51.9, 52.2, 53.2, 53.6, 52.0, 53.9, 52.9, 53.4, 5…
## $ Eye <dbl> 15.2, 16.0, 15.5, 15.2, 15.1, 14.2, 14.2, 14.5, 15.5, 14.4, 1…
## $ Chest <dbl> 28.0, 28.5, 30.0, 28.0, 28.5, 30.0, 30.0, 29.0, 28.0, 27.5, 3…
## $ Belly <dbl> 36.0, 33.0, 34.0, 34.0, 33.0, 32.0, 34.5, 34.0, 33.0, 32.0, 3…
select some columns
head(possum%>%select(1:3,10:14))
## Case Site Pop Footlgth Earconch Eye Chest Belly
## C3 1 1 Vic 74.5 54.5 15.2 28.0 36
## C5 2 1 Vic 72.5 51.2 16.0 28.5 33
## C10 3 1 Vic 75.4 51.9 15.5 30.0 34
## C15 4 1 Vic 76.1 52.2 15.2 28.0 34
## C23 5 1 Vic 71.0 53.2 15.1 28.5 33
## C24 6 1 Vic 73.2 53.6 14.2 30.0 32
deselect some columns
head(possum%>%select(c(-1,-2,-3)))
## Sex Age Hdlngth Skullw Totlngth Taill Footlgth Earconch Eye Chest Belly
## C3 m 8 94.1 60.4 89.0 36.0 74.5 54.5 15.2 28.0 36
## C5 f 6 92.5 57.6 91.5 36.5 72.5 51.2 16.0 28.5 33
## C10 f 6 94.0 60.0 95.5 39.0 75.4 51.9 15.5 30.0 34
## C15 f 6 93.2 57.1 92.0 38.0 76.1 52.2 15.2 28.0 34
## C23 f 2 91.5 56.3 85.5 36.0 71.0 53.2 15.1 28.5 33
## C24 f 1 93.1 54.8 90.5 35.5 73.2 53.6 14.2 30.0 32
filter and arrange
head(possum)
## Case Site Pop Sex Age Hdlngth Skullw Totlngth Taill Footlgth Earconch Eye
## C3 1 1 Vic m 8 94.1 60.4 89.0 36.0 74.5 54.5 15.2
## C5 2 1 Vic f 6 92.5 57.6 91.5 36.5 72.5 51.2 16.0
## C10 3 1 Vic f 6 94.0 60.0 95.5 39.0 75.4 51.9 15.5
## C15 4 1 Vic f 6 93.2 57.1 92.0 38.0 76.1 52.2 15.2
## C23 5 1 Vic f 2 91.5 56.3 85.5 36.0 71.0 53.2 15.1
## C24 6 1 Vic f 1 93.1 54.8 90.5 35.5 73.2 53.6 14.2
## Chest Belly
## C3 28.0 36
## C5 28.5 33
## C10 30.0 34
## C15 28.0 34
## C23 28.5 33
## C24 30.0 32
head(possum%>%filter(Sex=="f",Pop=="Vic",Age<14)%>%arrange(Age))
## Case Site Pop Sex Age Hdlngth Skullw Totlngth Taill Footlgth Earconch Eye
## C24 6 1 Vic f 1 93.1 54.8 90.5 35.5 73.2 53.6 14.2
## C45 17 1 Vic f 1 94.7 67.7 89.5 36.5 73.2 53.2 14.7
## BB31 39 2 Vic f 1 84.7 51.5 75.0 34.0 68.7 53.4 13.0
## C23 5 1 Vic f 2 91.5 56.3 85.5 36.0 71.0 53.2 15.1
## C63 27 1 Vic f 2 90.5 54.5 85.0 35.0 70.3 50.8 14.2
## A2 30 1 Vic f 2 92.1 54.4 84.0 33.5 70.6 50.8 14.5
## Chest Belly
## C24 30.0 32
## C45 29.0 31
## BB31 25.0 25
## C23 28.5 33
## C63 23.0 28
## A2 24.5 33
binary logistic regresion
library(gtsummary)
model<-glm(Sex~.,data=possum,family="binomial")%>%tbl_regression(exponentiate = TRUE)
model
| Characteristic | OR1 | 95% CI1 | p-value |
|---|---|---|---|
| Case | 1.04 | 0.96, 1.13 | 0.3 |
| Site | |||
|     1 | — | — | |
| Â Â Â Â 2 | 0.91 | 0.07, 13.8 | >0.9 |
| Â Â Â Â 3 | 3.02 | 0.04, 239 | 0.6 |
| Â Â Â Â 4 | 7.59 | 0.05, 1,279 | 0.4 |
| Â Â Â Â 5 | 4.04 | 0.02, 865 | 0.6 |
| Â Â Â Â 6 | 8.90 | 0.02, 4,536 | 0.5 |
| Â Â Â Â 7 | 3.11 | 0.00, 2,867 | 0.7 |
| Pop | |||
|     Vic | — | — | |
| Â Â Â Â other | |||
| Age | 0.98 | 0.72, 1.31 | 0.9 |
| Hdlngth | 1.26 | 0.96, 1.68 | 0.10 |
| Skullw | 1.07 | 0.83, 1.44 | 0.6 |
| Totlngth | 0.88 | 0.67, 1.13 | 0.3 |
| Taill | 0.81 | 0.50, 1.28 | 0.4 |
| Footlgth | 1.12 | 0.86, 1.47 | 0.4 |
| Earconch | 1.27 | 0.90, 1.83 | 0.2 |
| Eye | 1.82 | 1.04, 3.42 | 0.045 |
| Chest | 0.91 | 0.61, 1.33 | 0.6 |
| Belly | 0.90 | 0.70, 1.15 | 0.4 |
| 1 OR = Odds Ratio, CI = Confidence Interval | |||
group by
possum%>%filter(Sex=="m")%>%group_by(Site)%>%summarise(Avg=mean(Belly),sd=sd(Belly),count=n())%>%arrange(desc(Site))
## # A tibble: 7 × 4
## Site Avg sd count
## <fct> <dbl> <dbl> <int>
## 1 7 31.8 2.25 14
## 2 6 31.5 2.78 9
## 3 5 30.9 2.28 7
## 4 4 34.6 2.22 5
## 5 3 34 1.47 4
## 6 2 32.1 3.37 8
## 7 1 33.2 2.49 14
dim(possum%>%filter(Sex=="m"))
## [1] 61 14
mutate
library(dplyr)
p<-possum%>%filter(Sex=="m")%>%group_by(Site)%>%reframe(TR=mean(Taill/Eye),count=n())%>%arrange(desc(Site))
p
## # A tibble: 7 × 3
## Site TR count
## <fct> <dbl> <int>
## 1 7 2.59 14
## 2 6 2.42 9
## 3 5 2.41 7
## 4 4 2.50 5
## 5 3 2.34 4
## 6 2 2.35 8
## 7 1 2.36 14
histogram with density
interactions
possum%>%count(Sex,Age)%>%group_by(Age)%>%mutate(proportion=n/sum(n))%>%ggplot(aes(Age,proportion,color=Sex))+geom_line()+theme_minimal()
## Warning: Removed 1 row containing missing values (`geom_line()`).
marginal plots
library(ggpubr)
library(ggalt)
p<-possum%>%ggplot(aes(Skullw,Hdlngth))+geom_point()+geom_smooth(method="lm",se=0)+geom_encircle(data=possum,aes(Skullw,Hdlngth,expand=0))+ggtitle("scatterplot:hdlength vs skullw")+theme_minimal()
library(av)
library(gifski)
#p+transition_speed(speed)
#p+ggMarginal(p,type='histogram')
library(stringr)
library(gapminder)
names(gapminder)<-str_to_title(names(gapminder))
names(gapminder)
## [1] "Country" "Continent" "Year" "Lifeexp" "Pop" "Gdppercap"
p<-ggplot(gapminder,aes(Gdppercap,Lifeexp))+geom_point()
#p+transition_time(Year)
#area=2*pi*r*h+2*pi*r^2#cylinder area
cylinder<-function(r,h,area){area<-2*pi*r*h
return(cbind(area))}
cylinder(2,2)
## area
## [1,] 25.13274
cylinder(1:10,20)
## area
## [1,] 125.6637
## [2,] 251.3274
## [3,] 376.9911
## [4,] 502.6548
## [5,] 628.3185
## [6,] 753.9822
## [7,] 879.6459
## [8,] 1005.3096
## [9,] 1130.9734
## [10,] 1256.6371
cylinder(1:10,20)
## area
## [1,] 125.6637
## [2,] 251.3274
## [3,] 376.9911
## [4,] 502.6548
## [5,] 628.3185
## [6,] 753.9822
## [7,] 879.6459
## [8,] 1005.3096
## [9,] 1130.9734
## [10,] 1256.6371
#change to a matrix
cylinder<-function(r,h,area){area<-2*pi*r*h
return(cbind(r,h,area))}
cylinder(1:10,20:29)
## r h area
## [1,] 1 20 125.6637
## [2,] 2 21 263.8938
## [3,] 3 22 414.6902
## [4,] 4 23 578.0530
## [5,] 5 24 753.9822
## [6,] 6 25 942.4778
## [7,] 7 26 1143.5397
## [8,] 8 27 1357.1680
## [9,] 9 28 1583.3627
## [10,] 10 29 1822.1237
cylinder<-function(r,h,area){area<-2*pi*h
return (data.frame(r,h,area))}
cylinder(2:20,42:60)
## r h area
## 1 2 42 263.8938
## 2 3 43 270.1770
## 3 4 44 276.4602
## 4 5 45 282.7433
## 5 6 46 289.0265
## 6 7 47 295.3097
## 7 8 48 301.5929
## 8 9 49 307.8761
## 9 10 50 314.1593
## 10 11 51 320.4425
## 11 12 52 326.7256
## 12 13 53 333.0088
## 13 14 54 339.2920
## 14 15 55 345.5752
## 15 16 56 351.8584
## 16 17 57 358.1416
## 17 18 58 364.4247
## 18 19 59 370.7079
## 19 20 60 376.9911