Data_Manipulation

I am going to do data manipulation to bring out meaningfull insights and visualizations,by use of following data:

1.0 gss_cat

2.0 cars data

3.0 possum data

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(forcats)
library(patchwork)
data(gss_cat)
names(gss_cat)<-str_to_title(names(gss_cat))
gss<-gss_cat
head(gss)

## # A tibble: 6 × 9
##    Year Marital         Age Race  Rincome        Partyid     Relig Denom Tvhours
##   <int> <fct>         <int> <fct> <fct>          <fct>       <fct> <fct>   <int>
## 1  2000 Never married    26 White $8000 to 9999  Ind,near r… Prot… Sout…      12
## 2  2000 Divorced         48 White $8000 to 9999  Not str re… Prot… Bapt…      NA
## 3  2000 Widowed          67 White Not applicable Independent Prot… No d…       2
## 4  2000 Never married    39 White Not applicable Ind,near r… Orth… Not …       4
## 5  2000 Divorced         25 White Not applicable Not str de… None  Not …       1
## 6  2000 Married          25 White $20000 - 24999 Strong dem… Prot… Sout…      NA

glimpse(gss)

## Rows: 21,483
## Columns: 9
## $ Year    <int> 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 20…
## $ Marital <fct> Never married, Divorced, Widowed, Never married, Divorced, Mar…
## $ Age     <int> 26, 48, 67, 39, 25, 25, 36, 44, 44, 47, 53, 52, 52, 51, 52, 40…
## $ Race    <fct> White, White, White, White, White, White, White, White, White,…
## $ Rincome <fct> $8000 to 9999, $8000 to 9999, Not applicable, Not applicable, …
## $ Partyid <fct> "Ind,near rep", "Not str republican", "Independent", "Ind,near…
## $ Relig   <fct> Protestant, Protestant, Protestant, Orthodox-christian, None, …
## $ Denom   <fct> "Southern baptist", "Baptist-dk which", "No denomination", "No…
## $ Tvhours <int> 12, NA, 2, 4, 1, NA, 3, NA, 0, 3, 2, NA, 1, NA, 1, 7, NA, 3, 3…

summary(gss)

##       Year               Marital           Age                    Race      
##  Min.   :2000   No answer    :   17   Min.   :18.00   Other         : 1959  
##  1st Qu.:2002   Never married: 5416   1st Qu.:33.00   Black         : 3129  
##  Median :2006   Separated    :  743   Median :46.00   White         :16395  
##  Mean   :2007   Divorced     : 3383   Mean   :47.18   Not applicable:    0  
##  3rd Qu.:2010   Widowed      : 1807   3rd Qu.:59.00                         
##  Max.   :2014   Married      :10117   Max.   :89.00                         
##                                       NA's   :76                            
##            Rincome                   Partyid            Relig      
##  $25000 or more:7363   Independent       :4119   Protestant:10846  
##  Not applicable:7043   Not str democrat  :3690   Catholic  : 5124  
##  $20000 - 24999:1283   Strong democrat   :3490   None      : 3523  
##  $10000 - 14999:1168   Not str republican:3032   Christian :  689  
##  $15000 - 19999:1048   Ind,near dem      :2499   Jewish    :  388  
##  Refused       : 975   Strong republican :2314   Other     :  224  
##  (Other)       :2603   (Other)           :2339   (Other)   :  689  
##               Denom          Tvhours      
##  Not applicable  :10072   Min.   : 0.000  
##  Other           : 2534   1st Qu.: 1.000  
##  No denomination : 1683   Median : 2.000  
##  Southern baptist: 1536   Mean   : 2.981  
##  Baptist-dk which: 1457   3rd Qu.: 4.000  
##  United methodist: 1067   Max.   :24.000  
##  (Other)         : 3134   NA's   :10146

names(gss)

## [1] "Year"    "Marital" "Age"     "Race"    "Rincome" "Partyid" "Relig"  
## [8] "Denom"   "Tvhours"

dim(gss)

## [1] 21483     9

data wrangling

gss%>%pull(Race)%>%unique()

## [1] White Black Other
## Levels: Other Black White Not applicable

class(gss$Race)

## [1] "factor"

count(gss,Race,sort = T)

## # A tibble: 3 × 2
##   Race      n
##   <fct> <int>
## 1 White 16395
## 2 Black  3129
## 3 Other  1959

gss<-gss%>%mutate(Race=fct_drop(Race))

levels(gss$Race)

## [1] "Other" "Black" "White"

gss%>%select(Race)%>%table()%>%barplot()

dim(gss)

## [1] 21483     9

fct_reorder fct_rev fct_infreq

fct_recode

levels(gss$Partyid)

##  [1] "No answer"          "Don't know"         "Other party"       
##  [4] "Strong republican"  "Not str republican" "Ind,near rep"      
##  [7] "Independent"        "Ind,near dem"       "Not str democrat"  
## [10] "Strong democrat"

gss%>%mutate(Partyid=fct_recode(Partyid,"No"="No answer","S.r"="Strong republican")) %>%count(Partyid)

## # A tibble: 10 × 2
##    Partyid                n
##    <fct>              <int>
##  1 No                   154
##  2 Don't know             1
##  3 Other party          393
##  4 S.r                 2314
##  5 Not str republican  3032
##  6 Ind,near rep        1791
##  7 Independent         4119
##  8 Ind,near dem        2499
##  9 Not str democrat    3690
## 10 Strong democrat     3490

fct_collapse

gss%>%mutate(Partyid=fct_collapse(Partyid,Others=c("Don't know","No answer","Other party"),Republicans=c("Not str republican","Strong republican"),Independent=c("Ind,near rep","Independent","Ind,near dem"),Democrats=c("Not str democrat","Strong democrat")))%>%count(Partyid)

## # A tibble: 4 × 2
##   Partyid         n
##   <fct>       <int>
## 1 Others        548
## 2 Republicans  5346
## 3 Independent  8409
## 4 Democrats    7180

factor lump

gss%>%count(Relig)

## # A tibble: 15 × 2
##    Relig                       n
##    <fct>                   <int>
##  1 No answer                  93
##  2 Don't know                 15
##  3 Inter-nondenominational   109
##  4 Native american            23
##  5 Christian                 689
##  6 Orthodox-christian         95
##  7 Moslem/islam              104
##  8 Other eastern              32
##  9 Hinduism                   71
## 10 Buddhism                  147
## 11 Other                     224
## 12 None                     3523
## 13 Jewish                    388
## 14 Catholic                 5124
## 15 Protestant              10846

gss%>%mutate(Relig=fct_lump(Relig,4))%>%count(Relig)

## # A tibble: 5 × 2
##   Relig          n
##   <fct>      <int>
## 1 Christian    689
## 2 None        3523
## 3 Catholic    5124
## 4 Protestant 10846
## 5 Other       1301

reorder

## [1] "No answer"     "Never married" "Separated"     "Divorced"     
## [5] "Widowed"       "Married"

## NULL

p<-function(gss){sum(is.na(gss))/length(gss)*100}#missing value percentages
apply(gss,2,p)

##       Year    Marital        Age       Race    Rincome    Partyid      Relig 
##  0.0000000  0.0000000  0.3537681  0.0000000  0.0000000  0.0000000  0.0000000 
##      Denom    Tvhours 
##  0.0000000 47.2280408

#boxplot(gss$Age[gss$Marital=="Widowed"])

2.0 cars data

## 'data.frame':    50 obs. of  2 variables:
##  $ speed: num  4 4 7 7 8 9 10 10 10 11 ...
##  $ dist : num  2 10 4 22 16 10 18 26 34 17 ...

add column

summary(cars)

##      speed           dist             c           time      
##  Min.   : 4.0   Min.   :  2.00   high  :27   Min.   :0.500  
##  1st Qu.:12.0   1st Qu.: 26.00   low   :15   1st Qu.:1.921  
##  Median :15.0   Median : 36.00   medium: 8   Median :2.523  
##  Mean   :15.4   Mean   : 42.98               Mean   :2.632  
##  3rd Qu.:19.0   3rd Qu.: 56.00               3rd Qu.:3.186  
##  Max.   :25.0   Max.   :120.00               Max.   :5.714

lm<-lm(speed~dist,data=cars)
summary(lm)

## 
## Call:
## lm(formula = speed ~ dist, data = cars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.5293 -2.1550  0.3615  2.4377  6.4179 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.28391    0.87438   9.474 1.44e-12 ***
## dist         0.16557    0.01749   9.464 1.49e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.156 on 48 degrees of freedom
## Multiple R-squared:  0.6511, Adjusted R-squared:  0.6438 
## F-statistic: 89.57 on 1 and 48 DF,  p-value: 1.49e-12

groupby and summarise

p<-cars%>%group_by(c)%>%count()
p

## # A tibble: 3 × 2
## # Groups:   c [3]
##   c          n
##   <fct>  <int>
## 1 high      27
## 2 low       15
## 3 medium     8

plot

cars%>%summarise_at(.vars=vars(speed,dist),.funs = mean)

##   speed  dist
## 1  15.4 42.98

2.0 possum data

library(DAAG)
library(stringr)
data(possum)
names(possum)<-str_to_title(names(possum))
possum$Site<-as.factor(possum$Site)
glimpse(possum)

## Rows: 104
## Columns: 14
## $ Case     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ Site     <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Pop      <fct> Vic, Vic, Vic, Vic, Vic, Vic, Vic, Vic, Vic, Vic, Vic, Vic, V…
## $ Sex      <fct> m, f, f, f, f, f, m, f, f, f, f, f, m, m, m, m, f, m, f, f, f…
## $ Age      <dbl> 8, 6, 6, 6, 2, 1, 2, 6, 9, 6, 9, 5, 5, 3, 5, 4, 1, 2, 5, 4, 3…
## $ Hdlngth  <dbl> 94.1, 92.5, 94.0, 93.2, 91.5, 93.1, 95.3, 94.8, 93.4, 91.8, 9…
## $ Skullw   <dbl> 60.4, 57.6, 60.0, 57.1, 56.3, 54.8, 58.2, 57.6, 56.3, 58.0, 5…
## $ Totlngth <dbl> 89.0, 91.5, 95.5, 92.0, 85.5, 90.5, 89.5, 91.0, 91.5, 89.5, 8…
## $ Taill    <dbl> 36.0, 36.5, 39.0, 38.0, 36.0, 35.5, 36.0, 37.0, 37.0, 37.5, 3…
## $ Footlgth <dbl> 74.5, 72.5, 75.4, 76.1, 71.0, 73.2, 71.5, 72.7, 72.4, 70.9, 7…
## $ Earconch <dbl> 54.5, 51.2, 51.9, 52.2, 53.2, 53.6, 52.0, 53.9, 52.9, 53.4, 5…
## $ Eye      <dbl> 15.2, 16.0, 15.5, 15.2, 15.1, 14.2, 14.2, 14.5, 15.5, 14.4, 1…
## $ Chest    <dbl> 28.0, 28.5, 30.0, 28.0, 28.5, 30.0, 30.0, 29.0, 28.0, 27.5, 3…
## $ Belly    <dbl> 36.0, 33.0, 34.0, 34.0, 33.0, 32.0, 34.5, 34.0, 33.0, 32.0, 3…

select some columns

head(possum%>%select(1:3,10:14))

##     Case Site Pop Footlgth Earconch  Eye Chest Belly
## C3     1    1 Vic     74.5     54.5 15.2  28.0    36
## C5     2    1 Vic     72.5     51.2 16.0  28.5    33
## C10    3    1 Vic     75.4     51.9 15.5  30.0    34
## C15    4    1 Vic     76.1     52.2 15.2  28.0    34
## C23    5    1 Vic     71.0     53.2 15.1  28.5    33
## C24    6    1 Vic     73.2     53.6 14.2  30.0    32

deselect some columns

head(possum%>%select(c(-1,-2,-3)))

##     Sex Age Hdlngth Skullw Totlngth Taill Footlgth Earconch  Eye Chest Belly
## C3    m   8    94.1   60.4     89.0  36.0     74.5     54.5 15.2  28.0    36
## C5    f   6    92.5   57.6     91.5  36.5     72.5     51.2 16.0  28.5    33
## C10   f   6    94.0   60.0     95.5  39.0     75.4     51.9 15.5  30.0    34
## C15   f   6    93.2   57.1     92.0  38.0     76.1     52.2 15.2  28.0    34
## C23   f   2    91.5   56.3     85.5  36.0     71.0     53.2 15.1  28.5    33
## C24   f   1    93.1   54.8     90.5  35.5     73.2     53.6 14.2  30.0    32

filter and arrange

head(possum)

##     Case Site Pop Sex Age Hdlngth Skullw Totlngth Taill Footlgth Earconch  Eye
## C3     1    1 Vic   m   8    94.1   60.4     89.0  36.0     74.5     54.5 15.2
## C5     2    1 Vic   f   6    92.5   57.6     91.5  36.5     72.5     51.2 16.0
## C10    3    1 Vic   f   6    94.0   60.0     95.5  39.0     75.4     51.9 15.5
## C15    4    1 Vic   f   6    93.2   57.1     92.0  38.0     76.1     52.2 15.2
## C23    5    1 Vic   f   2    91.5   56.3     85.5  36.0     71.0     53.2 15.1
## C24    6    1 Vic   f   1    93.1   54.8     90.5  35.5     73.2     53.6 14.2
##     Chest Belly
## C3   28.0    36
## C5   28.5    33
## C10  30.0    34
## C15  28.0    34
## C23  28.5    33
## C24  30.0    32

head(possum%>%filter(Sex=="f",Pop=="Vic",Age<14)%>%arrange(Age))

##      Case Site Pop Sex Age Hdlngth Skullw Totlngth Taill Footlgth Earconch  Eye
## C24     6    1 Vic   f   1    93.1   54.8     90.5  35.5     73.2     53.6 14.2
## C45    17    1 Vic   f   1    94.7   67.7     89.5  36.5     73.2     53.2 14.7
## BB31   39    2 Vic   f   1    84.7   51.5     75.0  34.0     68.7     53.4 13.0
## C23     5    1 Vic   f   2    91.5   56.3     85.5  36.0     71.0     53.2 15.1
## C63    27    1 Vic   f   2    90.5   54.5     85.0  35.0     70.3     50.8 14.2
## A2     30    1 Vic   f   2    92.1   54.4     84.0  33.5     70.6     50.8 14.5
##      Chest Belly
## C24   30.0    32
## C45   29.0    31
## BB31  25.0    25
## C23   28.5    33
## C63   23.0    28
## A2    24.5    33

binary logistic regresion

library(gtsummary)
model<-glm(Sex~.,data=possum,family="binomial")%>%tbl_regression(exponentiate = TRUE)
model

Characteristic	OR¹	95% CI¹	p-value
Case	1.04	0.96, 1.13	0.3
Site
1	—	—
2	0.91	0.07, 13.8	>0.9
3	3.02	0.04, 239	0.6
4	7.59	0.05, 1,279	0.4
5	4.04	0.02, 865	0.6
6	8.90	0.02, 4,536	0.5
7	3.11	0.00, 2,867	0.7
Pop
Vic	—	—
other
Age	0.98	0.72, 1.31	0.9
Hdlngth	1.26	0.96, 1.68	0.10
Skullw	1.07	0.83, 1.44	0.6
Totlngth	0.88	0.67, 1.13	0.3
Taill	0.81	0.50, 1.28	0.4
Footlgth	1.12	0.86, 1.47	0.4
Earconch	1.27	0.90, 1.83	0.2
Eye	1.82	1.04, 3.42	0.045
Chest	0.91	0.61, 1.33	0.6
Belly	0.90	0.70, 1.15	0.4
¹ OR = Odds Ratio, CI = Confidence Interval

group by

possum%>%filter(Sex=="m")%>%group_by(Site)%>%summarise(Avg=mean(Belly),sd=sd(Belly),count=n())%>%arrange(desc(Site))

## # A tibble: 7 × 4
##   Site    Avg    sd count
##   <fct> <dbl> <dbl> <int>
## 1 7      31.8  2.25    14
## 2 6      31.5  2.78     9
## 3 5      30.9  2.28     7
## 4 4      34.6  2.22     5
## 5 3      34    1.47     4
## 6 2      32.1  3.37     8
## 7 1      33.2  2.49    14

dim(possum%>%filter(Sex=="m"))

## [1] 61 14

mutate

library(dplyr)
p<-possum%>%filter(Sex=="m")%>%group_by(Site)%>%reframe(TR=mean(Taill/Eye),count=n())%>%arrange(desc(Site)) 
p

## # A tibble: 7 × 3
##   Site     TR count
##   <fct> <dbl> <int>
## 1 7      2.59    14
## 2 6      2.42     9
## 3 5      2.41     7
## 4 4      2.50     5
## 5 3      2.34     4
## 6 2      2.35     8
## 7 1      2.36    14

histogram with density interactions

possum%>%count(Sex,Age)%>%group_by(Age)%>%mutate(proportion=n/sum(n))%>%ggplot(aes(Age,proportion,color=Sex))+geom_line()+theme_minimal()

## Warning: Removed 1 row containing missing values (`geom_line()`).

marginal plots

library(ggpubr)
library(ggalt)
p<-possum%>%ggplot(aes(Skullw,Hdlngth))+geom_point()+geom_smooth(method="lm",se=0)+geom_encircle(data=possum,aes(Skullw,Hdlngth,expand=0))+ggtitle("scatterplot:hdlength vs skullw")+theme_minimal()
library(av)
library(gifski)
#p+transition_speed(speed)
#p+ggMarginal(p,type='histogram')
library(stringr)
library(gapminder)
names(gapminder)<-str_to_title(names(gapminder))
names(gapminder)

## [1] "Country"   "Continent" "Year"      "Lifeexp"   "Pop"       "Gdppercap"

p<-ggplot(gapminder,aes(Gdppercap,Lifeexp))+geom_point()
#p+transition_time(Year)

loops

#area=2*pi*r*h+2*pi*r^2#cylinder area 
cylinder<-function(r,h,area){area<-2*pi*r*h 
return(cbind(area))}
cylinder(2,2)

##          area
## [1,] 25.13274

cylinder(1:10,20)

##            area
##  [1,]  125.6637
##  [2,]  251.3274
##  [3,]  376.9911
##  [4,]  502.6548
##  [5,]  628.3185
##  [6,]  753.9822
##  [7,]  879.6459
##  [8,] 1005.3096
##  [9,] 1130.9734
## [10,] 1256.6371

cylinder(1:10,20)

##            area
##  [1,]  125.6637
##  [2,]  251.3274
##  [3,]  376.9911
##  [4,]  502.6548
##  [5,]  628.3185
##  [6,]  753.9822
##  [7,]  879.6459
##  [8,] 1005.3096
##  [9,] 1130.9734
## [10,] 1256.6371

#change to a matrix
cylinder<-function(r,h,area){area<-2*pi*r*h 
return(cbind(r,h,area))}
cylinder(1:10,20:29)

##        r  h      area
##  [1,]  1 20  125.6637
##  [2,]  2 21  263.8938
##  [3,]  3 22  414.6902
##  [4,]  4 23  578.0530
##  [5,]  5 24  753.9822
##  [6,]  6 25  942.4778
##  [7,]  7 26 1143.5397
##  [8,]  8 27 1357.1680
##  [9,]  9 28 1583.3627
## [10,] 10 29 1822.1237

cylinder<-function(r,h,area){area<-2*pi*h
return (data.frame(r,h,area))}

cylinder(2:20,42:60)

##     r  h     area
## 1   2 42 263.8938
## 2   3 43 270.1770
## 3   4 44 276.4602
## 4   5 45 282.7433
## 5   6 46 289.0265
## 6   7 47 295.3097
## 7   8 48 301.5929
## 8   9 49 307.8761
## 9  10 50 314.1593
## 10 11 51 320.4425
## 11 12 52 326.7256
## 12 13 53 333.0088
## 13 14 54 339.2920
## 14 15 55 345.5752
## 15 16 56 351.8584
## 16 17 57 358.1416
## 17 18 58 364.4247
## 18 19 59 370.7079
## 19 20 60 376.9911

Data_Manipulation

mugo_muiruri_james

2024-02-04

loops