Visualization

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(nycflights13)
# Show the structure (column names, types, first values) of `flights`
utils::str(flights)

## tibble [336,776 × 19] (S3: tbl_df/tbl/data.frame)
##  $ year          : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ month         : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ day           : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ dep_time      : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
##  $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
##  $ dep_delay     : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
##  $ arr_time      : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
##  $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
##  $ arr_delay     : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
##  $ carrier       : chr [1:336776] "UA" "UA" "AA" "B6" ...
##  $ flight        : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
##  $ tailnum       : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
##  $ origin        : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
##  $ dest          : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
##  $ air_time      : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
##  $ distance      : num [1:336776] 1400 1416 1089 1576 762 ...
##  $ hour          : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
##  $ minute        : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
##  $ time_hour     : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...

# Preview the first six rows of `flights`
flights %>%
  head(n = 6)

## # A tibble: 6 × 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     1      517            515         2      830            819
## 2  2013     1     1      533            529         4      850            830
## 3  2013     1     1      542            540         2      923            850
## 4  2013     1     1      544            545        -1     1004           1022
## 5  2013     1     1      554            600        -6      812            837
## 6  2013     1     1      554            558        -4      740            728
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

CREATE NEW COLUMNS WITH MUTATE, THEN COUNT

# Flag flights that spent at least six hours (6 * 60 minutes) in the air.
# `long_flight` is NA wherever `air_time` is missing.
df <- mutate(flights, long_flight = air_time >= 6 * 60)

# Dimensions of the original `flights` table (not of `df`)
dim(flights)

## [1] 336776     19

# str(df)
# Tally the rows of `df` by the value of the `long_flight` flag
count(df, long_flight)

## # A tibble: 3 × 2
##   long_flight      n
##   <lgl>        <int>
## 1 FALSE       322630
## 2 TRUE          4716
## 3 NA            9430

GROUP BY

# Per-day summary: number of flights and median air time.
# The grouping key `date` is built on the fly from year/month/day.
df2 <- flights %>%
  group_by(date = make_date(year, month, day)) %>%
  summarise(
    flights_n = n(),
    air_time_median = median(air_time, na.rm = TRUE)
  ) %>%
  ungroup()
df2

## # A tibble: 365 × 3
##    date       flights_n air_time_median
##    <date>         <int>           <dbl>
##  1 2013-01-01       842            149 
##  2 2013-01-02       943            148 
##  3 2013-01-03       914            148 
##  4 2013-01-04       915            140 
##  5 2013-01-05       720            147 
##  6 2013-01-06       832            147 
##  7 2013-01-07       933            126.
##  8 2013-01-08       899            126.
##  9 2013-01-09       902            135 
## 10 2013-01-10       932            126 
## # ℹ 355 more rows

SAMPLE

# Random sample containing 1% of the rows
v <- slice_sample(flights, prop = 0.01)
v

## # A tibble: 3,367 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     5     8      616            629       -13      748            825
##  2  2013     5    31     1842           1835         7     2022           2049
##  3  2013     1    17      954            845        69     1120           1006
##  4  2013    10    18     1559           1550         9     1838           1816
##  5  2013     6    24     1704           1553        71     1846           1709
##  6  2013     9    26     1256           1259        -3     1523           1501
##  7  2013     2    17     2204           2112        52     2318           2224
##  8  2013    12    27     1412           1410         2     1628           1701
##  9  2013     1     8     1645           1645         0     1854           1900
## 10  2013    11    15     1245           1247        -2     1356           1415
## # ℹ 3,357 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

# Sampling 100% of the rows returns every row in random order
f <- slice_sample(flights, prop = 1)
f

## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     4     9     1450           1350        60     1736           1638
##  2  2013    10    11     1433           1429         4     1724           1741
##  3  2013     7    15     1503           1455         8     1656           1645
##  4  2013    10     6     1439           1445        -6     1601           1629
##  5  2013     2     8       NA           1635        NA       NA           1856
##  6  2013     7    17     1505           1503         2     1719           1659
##  7  2013     8    13     1315           1245        30     1415           1404
##  8  2013     4    16     1552           1600        -8     1855           1901
##  9  2013     8    18     1549           1530        19     1735           1715
## 10  2013     1     3      901            900         1     1031           1048
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

# Random sample of exactly three rows
h <- slice_sample(flights, n = 3)
h

## # A tibble: 3 × 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     8     5      601            601         0      920            915
## 2  2013    10    27      645            650        -5      811            819
## 3  2013     9     7      720            730       -10      804            827
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

GENERATE 3 RANDOM FLIGHTS BASED ON ORIGIN

# Three random flights per origin airport.
# No ungroup() follows, so `nn` stays grouped by `origin`.
nn <- flights %>%
  group_by(origin) %>%
  slice_sample(n = 3)
nn

## # A tibble: 9 × 19
## # Groups:   origin [3]
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1    11     1543           1555       -12     1754           1810
## 2  2013    12    21     1826           1825         1     2132           2152
## 3  2013     6     2     1724           1450       154     1859           1642
## 4  2013     6    18      553            600        -7      713            712
## 5  2013    10     5     1820           1829        -9     1912           1949
## 6  2013     1    14     1725           1725         0     2031           2040
## 7  2013     4    28     2139           2130         9     2309           2300
## 8  2013     6     1      824            830        -6      948           1015
## 9  2013     8    15      756            802        -6      908            930
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

CREATE DATE

# Keep only the date parts and combine them into a single Date column
bbb <- flights %>%
  select(year, month, day) %>%
  mutate(date = make_date(year, month, day))
bbb

## # A tibble: 336,776 × 4
##     year month   day date      
##    <int> <int> <int> <date>    
##  1  2013     1     1 2013-01-01
##  2  2013     1     1 2013-01-01
##  3  2013     1     1 2013-01-01
##  4  2013     1     1 2013-01-01
##  5  2013     1     1 2013-01-01
##  6  2013     1     1 2013-01-01
##  7  2013     1     1 2013-01-01
##  8  2013     1     1 2013-01-01
##  9  2013     1     1 2013-01-01
## 10  2013     1     1 2013-01-01
## # ℹ 336,766 more rows

SELECT STARTS_WITH

# Columns whose names begin with "dep_"
select(flights, starts_with("dep_"))

## # A tibble: 336,776 × 2
##    dep_time dep_delay
##       <int>     <dbl>
##  1      517         2
##  2      533         4
##  3      542         2
##  4      544        -1
##  5      554        -6
##  6      554        -4
##  7      555        -5
##  8      557        -3
##  9      557        -3
## 10      558        -2
## # ℹ 336,766 more rows

# Move the "dep_" columns to the front and keep all remaining columns
select(flights, starts_with("dep_"), everything())

## # A tibble: 336,776 × 19
##    dep_time dep_delay  year month   day sched_dep_time arr_time sched_arr_time
##       <int>     <dbl> <int> <int> <int>          <int>    <int>          <int>
##  1      517         2  2013     1     1            515      830            819
##  2      533         4  2013     1     1            529      850            830
##  3      542         2  2013     1     1            540      923            850
##  4      544        -1  2013     1     1            545     1004           1022
##  5      554        -6  2013     1     1            600      812            837
##  6      554        -4  2013     1     1            558      740            728
##  7      555        -5  2013     1     1            600      913            854
##  8      557        -3  2013     1     1            600      709            723
##  9      557        -3  2013     1     1            600      838            846
## 10      558        -2  2013     1     1            600      753            745
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

# Columns whose names end with "hour"
select(flights, ends_with("hour"))

## # A tibble: 336,776 × 2
##     hour time_hour          
##    <dbl> <dttm>             
##  1     5 2013-01-01 05:00:00
##  2     5 2013-01-01 05:00:00
##  3     5 2013-01-01 05:00:00
##  4     5 2013-01-01 05:00:00
##  5     6 2013-01-01 06:00:00
##  6     5 2013-01-01 05:00:00
##  7     6 2013-01-01 06:00:00
##  8     6 2013-01-01 06:00:00
##  9     6 2013-01-01 06:00:00
## 10     6 2013-01-01 06:00:00
## # ℹ 336,766 more rows

# Columns whose names contain "dep" anywhere
select(flights, contains("dep"))

## # A tibble: 336,776 × 3
##    dep_time sched_dep_time dep_delay
##       <int>          <int>     <dbl>
##  1      517            515         2
##  2      533            529         4
##  3      542            540         2
##  4      544            545        -1
##  5      554            600        -6
##  6      554            558        -4
##  7      555            600        -5
##  8      557            600        -3
##  9      557            600        -3
## 10      558            600        -2
## # ℹ 336,766 more rows

CASE WHEN

# Replace each origin airport code with a descriptive label, then tally
flights %>%
  mutate(
    origin = case_when(
      origin == "EWR" ~ "NEWYORK INTERNATIONAL AIRPORT",
      origin == "JFK" ~ "JOHN KENNEDY",
      origin == "LGA" ~ "LAIRPORT"
    )
  ) %>%
  count(origin)

## # A tibble: 3 × 2
##   origin                             n
##   <chr>                          <int>
## 1 JOHN KENNEDY                  111279
## 2 LAIRPORT                      104662
## 3 NEWYORK INTERNATIONAL AIRPORT 120835

STR_REPLACE_ALL

# Kept disabled: recode origin codes with a named vector of regex patterns.
# flights %>%
#   mutate(origin = str_replace_all(
#     origin,
#     c("^EWR$" = "NEW YORK", "^JFK$" = "KENNEDY", "^LGA$" = "LAGORF")
#   )) %>%
#   count(origin)

# Build the flight date and drop every other column
transmute(flights, date = make_date(year, month, day))

## # A tibble: 336,776 × 1
##    date      
##    <date>    
##  1 2013-01-01
##  2 2013-01-01
##  3 2013-01-01
##  4 2013-01-01
##  5 2013-01-01
##  6 2013-01-01
##  7 2013-01-01
##  8 2013-01-01
##  9 2013-01-01
## 10 2013-01-01
## # ℹ 336,766 more rows

# Display the carrier-code lookup table
print(airlines)

## # A tibble: 16 × 2
##    carrier name                       
##    <chr>   <chr>                      
##  1 9E      Endeavor Air Inc.          
##  2 AA      American Airlines Inc.     
##  3 AS      Alaska Airlines Inc.       
##  4 B6      JetBlue Airways            
##  5 DL      Delta Air Lines Inc.       
##  6 EV      ExpressJet Airlines Inc.   
##  7 F9      Frontier Airlines Inc.     
##  8 FL      AirTran Airways Corporation
##  9 HA      Hawaiian Airlines Inc.     
## 10 MQ      Envoy Air                  
## 11 OO      SkyWest Airlines Inc.      
## 12 UA      United Air Lines Inc.      
## 13 US      US Airways Inc.            
## 14 VX      Virgin America             
## 15 WN      Southwest Airlines Co.     
## 16 YV      Mesa Airlines Inc.

# Show the structure of the `airlines` lookup table
utils::str(airlines)

## tibble [16 × 2] (S3: tbl_df/tbl/data.frame)
##  $ carrier: chr [1:16] "9E" "AA" "AS" "B6" ...
##  $ name   : chr [1:16] "Endeavor Air Inc." "American Airlines Inc." "Alaska Airlines Inc." "JetBlue Airways" ...

# Upper-case the airline name and strip a trailing " INC." / " CO." suffix.
# The string operation must run on the column inside mutate(); piping the
# whole data frame into gsub() would pass it as the `pattern` argument.
airlines %>%
  mutate(
    names = name %>%
      str_to_upper() %>%
      str_remove(" (INC|CO)\\.?$")
  )

## # A tibble: 16 × 3
##    carrier name                        names
##    <chr>   <chr>                       <chr>
##  1 9E      Endeavor Air Inc.           ENDEAVOR AIR
##  2 AA      American Airlines Inc.      AMERICAN AIRLINES
##  3 AS      Alaska Airlines Inc.        ALASKA AIRLINES
##  4 B6      JetBlue Airways             JETBLUE AIRWAYS
##  5 DL      Delta Air Lines Inc.        DELTA AIR LINES
##  6 EV      ExpressJet Airlines Inc.    EXPRESSJET AIRLINES
##  7 F9      Frontier Airlines Inc.      FRONTIER AIRLINES
##  8 FL      AirTran Airways Corporation AIRTRAN AIRWAYS CORPORATION
##  9 HA      Hawaiian Airlines Inc.      HAWAIIAN AIRLINES
## 10 MQ      Envoy Air                   ENVOY AIR
## 11 OO      SkyWest Airlines Inc.       SKYWEST AIRLINES
## 12 UA      United Air Lines Inc.       UNITED AIR LINES
## 13 US      US Airways Inc.             US AIRWAYS
## 14 VX      Virgin America              VIRGIN AMERICA
## 15 WN      Southwest Airlines Co.      SOUTHWEST AIRLINES
## 16 YV      Mesa Airlines Inc.          MESA AIRLINES

# Upper-case the airline name and strip a trailing AIR/AIRLINES/AIRWAYS
# word. The replacement runs on the column inside mutate(); piping the
# whole data frame into str_replace_all() coerces it to deparsed strings.
# The leading `\\s*` also removes the space before the stripped word.
# Only names that END with the pattern change; names ending in "INC.",
# "CO." or "CORPORATION" are left as they are.
airlines %>%
  mutate(
    names = name %>%
      str_to_upper() %>%
      str_replace_all("\\s*AIR?(LINES|WAYS)?(CORPORATION)?$", "")
  )

## # A tibble: 16 × 3
##    carrier name                        names
##    <chr>   <chr>                       <chr>
##  1 9E      Endeavor Air Inc.           ENDEAVOR AIR INC.
##  2 AA      American Airlines Inc.      AMERICAN AIRLINES INC.
##  3 AS      Alaska Airlines Inc.        ALASKA AIRLINES INC.
##  4 B6      JetBlue Airways             JETBLUE
##  5 DL      Delta Air Lines Inc.        DELTA AIR LINES INC.
##  6 EV      ExpressJet Airlines Inc.    EXPRESSJET AIRLINES INC.
##  7 F9      Frontier Airlines Inc.      FRONTIER AIRLINES INC.
##  8 FL      AirTran Airways Corporation AIRTRAN AIRWAYS CORPORATION
##  9 HA      Hawaiian Airlines Inc.      HAWAIIAN AIRLINES INC.
## 10 MQ      Envoy Air                   ENVOY
## 11 OO      SkyWest Airlines Inc.       SKYWEST AIRLINES INC.
## 12 UA      United Air Lines Inc.       UNITED AIR LINES INC.
## 13 US      US Airways Inc.             US AIRWAYS INC.
## 14 VX      Virgin America              VIRGIN AMERICA
## 15 WN      Southwest Airlines Co.      SOUTHWEST AIRLINES CO.
## 16 YV      Mesa Airlines Inc.          MESA AIRLINES INC.

# Mean mpg, disp and qsec for each cylinder count.
# across() replaces the superseded summarise_at(vars(...), ...) form.
mtcars %>%
  group_by(cyl) %>%
  summarise(across(c(mpg, disp, qsec), mean))

## # A tibble: 3 × 4
##     cyl   mpg  disp  qsec
##   <dbl> <dbl> <dbl> <dbl>
## 1     4  26.7  105.  19.1
## 2     6  19.7  183.  18.0
## 3     8  15.1  353.  16.8

# Recode `cyl` as a factor with custom labels for levels 4, 6 and 8,
# then print a transposed overview of every column
mtcars %>%
  mutate(
    cyl = factor(cyl, levels = c(4, 6, 8), labels = c("4c", "3d", "6r"))
  ) %>%
  glimpse()

## Rows: 32
## Columns: 11
## $ mpg  <dbl> 21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19.2, 17.8,…
## $ cyl  <fct> 3d, 3d, 4c, 3d, 6r, 3d, 6r, 4c, 4c, 3d, 3d, 6r, 6r, 6r, 6r, 6r, 6…
## $ disp <dbl> 160.0, 160.0, 108.0, 258.0, 360.0, 225.0, 360.0, 146.7, 140.8, 16…
## $ hp   <dbl> 110, 110, 93, 110, 175, 105, 245, 62, 95, 123, 123, 180, 180, 180…
## $ drat <dbl> 3.90, 3.90, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92,…
## $ wt   <dbl> 2.620, 2.875, 2.320, 3.215, 3.440, 3.460, 3.570, 3.190, 3.150, 3.…
## $ qsec <dbl> 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20.00, 22.90, 18…
## $ vs   <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,…
## $ am   <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,…
## $ gear <dbl> 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3,…
## $ carb <dbl> 4, 4, 1, 1, 2, 1, 4, 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2,…

VISUALIZATION

# Show the structure of the `mpg` dataset
utils::str(mpg)

## tibble [234 × 11] (S3: tbl_df/tbl/data.frame)
##  $ manufacturer: chr [1:234] "audi" "audi" "audi" "audi" ...
##  $ model       : chr [1:234] "a4" "a4" "a4" "a4" ...
##  $ displ       : num [1:234] 1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
##  $ year        : int [1:234] 1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
##  $ cyl         : int [1:234] 4 4 4 4 6 6 6 4 4 4 ...
##  $ trans       : chr [1:234] "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
##  $ drv         : chr [1:234] "f" "f" "f" "f" ...
##  $ cty         : int [1:234] 18 21 20 21 16 18 18 18 16 20 ...
##  $ hwy         : int [1:234] 29 29 31 30 26 26 27 26 25 28 ...
##  $ fl          : chr [1:234] "p" "p" "p" "p" ...
##  $ class       : chr [1:234] "compact" "compact" "compact" "compact" ...

# Scatter plot of displ vs cyl; colour encodes drive type and point size
# encodes the (discrete) transmission type
ggplot(mpg, aes(displ, cyl)) +
  geom_point(aes(colour = drv, size = trans))

## Warning: Using size for a discrete variable is not advised.

# Column chart of displ against cyl; geom_col() stacks the displ values
# of all rows sharing the same cyl
ggplot(mpg, aes(cyl, displ)) +
  geom_col() +
  theme_minimal()

# Scatter of displ vs cyl with a linear trend line, one facet per year,
# drawn with flipped axes.
# `se` is a logical switch, so FALSE is used rather than the number 0.
# NOTE(review): y is mapped to `cyl`, yet the axis label reads
# "mpg in the city" — confirm whether `cty` was the intended variable.
mpg %>%
  ggplot(aes(displ, cyl)) +
  geom_point() +
  geom_point(aes(colour = drv, size = trans)) +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~year, nrow = 1) +
  labs(x = "engine size", y = "mpg in the city", title = "fuel efficiency") +
  coord_flip() +
  theme_minimal()

## Warning: Using size for a discrete variable is not advised.

## `geom_smooth()` using formula = 'y ~ x'

# Bar chart of car classes, zoomed to counts between 5 and 60.
# coord_cartesian() zooms the view without dropping any data.
ggplot(mpg) +
  geom_bar(aes(x = class)) +
  coord_cartesian(ylim = c(5, 60))

# Bar chart of the number of cars in each class
ggplot(mpg) +
  geom_bar(aes(x = class)) # + facet_wrap(~year)

# Count cars per class; this reuses (and overwrites) the name `f`
f <- mpg %>%
  group_by(class) %>%
  count() # %>% arrange(desc(n))

# Bar chart of the number of cars per cylinder count
ggplot(mpg) +
  geom_bar(aes(x = cyl))

# Histogram of engine displacement with the default number of bins
ggplot(mpg) +
  geom_histogram(aes(displ))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Bar chart of the number of cars per class, ordered by count.
# The plot is stored under its own name: assigning it to `mpg` would
# replace the dataset with a ggplot object and break later uses of `mpg`.
# geom_col() already plots the values as given, so no `stat` is needed.
class_count_plot <- mpg %>%
  count(class) %>%
  mutate(class = fct_reorder(class, n)) %>%
  ggplot(aes(class, n)) +
  geom_col(fill = "blue")
class_count_plot

# n <- mpg %>% ggplot(aes(x = class)) + geom_bar()
# n
# c <- ggplot(mpg) +
#   geom_bar(aes(x = class, y = after_stat(100 * count / sum(count))))
# c

# Show the structure of the built-in CO2 dataset
utils::str(CO2)

## Classes 'nfnGroupedData', 'nfGroupedData', 'groupedData' and 'data.frame':   84 obs. of  5 variables:
##  $ Plant    : Ord.factor w/ 12 levels "Qn1"<"Qn2"<"Qn3"<..: 1 1 1 1 1 1 1 2 2 2 ...
##  $ Type     : Factor w/ 2 levels "Quebec","Mississippi": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Treatment: Factor w/ 2 levels "nonchilled","chilled": 1 1 1 1 1 1 1 1 1 1 ...
##  $ conc     : num  95 175 250 350 500 675 1000 95 175 250 ...
##  $ uptake   : num  16 30.4 34.8 37.2 35.3 39.2 39.7 13.6 27.3 37.1 ...
##  - attr(*, "formula")=Class 'formula'  language uptake ~ conc | Plant
##   .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv> 
##  - attr(*, "outer")=Class 'formula'  language ~Treatment * Type
##   .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv> 
##  - attr(*, "labels")=List of 2
##   ..$ x: chr "Ambient carbon dioxide concentration"
##   ..$ y: chr "CO2 uptake rate"
##  - attr(*, "units")=List of 2
##   ..$ x: chr "(uL/L)"
##   ..$ y: chr "(umol/m^2 s)"

# Column chart of the number of observations for each plant origin (Type)
CO2 %>%
  group_by(Type) %>%
  count() %>%
  ggplot(aes(Type, n)) +
  geom_col()

# Mean conc and uptake per treatment.
# across() replaces the superseded summarise_at(vars(...), ...) form.
CO2 %>%
  group_by(Treatment) %>%
  summarise(across(c(conc, uptake), mean))

## # A tibble: 2 × 3
##   Treatment   conc uptake
##   <fct>      <dbl>  <dbl>
## 1 nonchilled   435   30.6
## 2 chilled      435   23.8

# Column chart of mean uptake per treatment.
# across() replaces the superseded summarise_at(vars(...), ...) form.
CO2 %>%
  group_by(Treatment) %>%
  summarise(across(c(conc, uptake), mean)) %>%
  ggplot(aes(Treatment, uptake)) +
  geom_col(fill = "brown")

# Histogram of the per-treatment mean conc values (one value per
# treatment, so only two observations are binned).
# across() replaces the superseded summarise_at(vars(...), ...) form.
CO2 %>%
  group_by(Treatment) %>%
  summarise(across(c(conc, uptake), mean)) %>%
  ggplot(aes(conc)) +
  geom_histogram(fill = "brown")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Column chart of mean conc per treatment.
# across() replaces the superseded summarise_at(vars(...), ...) form.
CO2 %>%
  group_by(Treatment) %>%
  summarise(across(c(conc, uptake), mean)) %>%
  ggplot(aes(Treatment, conc)) +
  geom_col(fill = "green")

# Show the structure of the built-in mtcars dataset
utils::str(mtcars)

## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...

# Convert `gear` from numeric to a factor in a session-local copy of
# mtcars, then confirm the new column type
mtcars[["gear"]] <- as.factor(mtcars[["gear"]])
utils::str(mtcars)

## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: Factor w/ 3 levels "3","4","5": 2 2 2 1 1 1 1 2 2 2 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...

# Mean mpg, qsec, wt and hp for each gear count.
# across() replaces the superseded summarise_at(vars(...), ...) form;
# `mpg` is listed once (the original selection repeated it).
mtcars %>%
  group_by(gear) %>%
  summarise(across(c(mpg, qsec, wt, hp), mean))

## # A tibble: 3 × 5
##   gear    mpg  qsec    wt    hp
##   <fct> <dbl> <dbl> <dbl> <dbl>
## 1 3      16.1  17.7  3.89 176. 
## 2 4      24.5  19.0  2.62  89.5
## 3 5      21.4  15.6  2.63 196.

# Mean mpg and qsec for each cylinder count.
# across() replaces the superseded summarise_at(vars(...), ...) form.
mtcars %>%
  group_by(cyl) %>%
  summarise(across(c(mpg, qsec), mean))

## # A tibble: 3 × 3
##     cyl   mpg  qsec
##   <dbl> <dbl> <dbl>
## 1     4  26.7  19.1
## 2     6  19.7  18.0
## 3     8  15.1  16.8

# Per-gear means shared by both plots below.
# across() replaces the superseded summarise_at(vars(...), ...) form.
gear_means <- mtcars %>%
  group_by(gear) %>%
  summarise(across(c(mpg, qsec, hp), mean))

# Mean horsepower per gear. This plot gets its own name and is displayed;
# assigning it to `p2` would discard it when `p2` is reassigned below.
p1 <- ggplot(gear_means, aes(gear, hp)) +
  geom_col(fill = "grey")
p1

# Mean mpg per gear, with the gear levels ordered by mean mpg.
# fct_reorder() needs `gear` to be a factor at this point.
p2 <- gear_means %>%
  mutate(gear = fct_reorder(gear, mpg)) %>%
  ggplot(aes(gear, mpg)) +
  geom_col(fill = "purple")
p2

# Column chart of mean mpg per cylinder count.
# across() replaces the superseded summarise_at(vars(...), ...) form.
p <- mtcars %>%
  group_by(cyl) %>%
  summarise(across(c(mpg, qsec), mean)) %>%
  ggplot(aes(cyl, mpg)) +
  geom_col(fill = "orange")
p

# Column chart of mean qsec per cylinder count.
# Written with straight ASCII quotes (curly quotes are a syntax error in
# R) and with the assignment and the print on separate lines.
pe <- mtcars %>%
  group_by(cyl) %>%
  summarise(across(c(mpg, qsec), mean)) %>%
  ggplot(aes(cyl, qsec)) +
  geom_col(colour = "blue", fill = "yellow")
pe

Visualization

mugo_muiruri_james

2023-10-12