도로교통공단 2일차

# Derived-variable creation
# NOTE(review): setwd() pins the session to a machine-specific absolute path;
# the data loaded below comes from packages or URLs, so confirm this call is
# still needed before sharing the script.
setwd("C:/Data")
getwd()  # echo the working directory to confirm the change took effect

## [1] "C:/Data"

library(dplyr)

## 
## 다음의 패키지를 부착합니다: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

data(cars)
glimpse(cars)

## Rows: 50
## Columns: 2
## $ speed <dbl> 4, 4, 7, 7, 8, 9, 10, 10, 10, 11, 11, 12, 12, 12, 12, 13, 13, 13…
## $ dist  <dbl> 2, 10, 4, 22, 16, 10, 18, 26, 34, 17, 28, 14, 20, 24, 28, 26, 34…

summary(cars) # 결측값이 존재하지 않음

##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

plot(cars)

# Fit a simple linear regression of stopping distance on speed, then print
# the estimated intercept and slope.
car_model <- lm(formula = dist ~ speed, data = cars)
coef(car_model)

## (Intercept)       speed 
##  -17.579095    3.932409

abline(car_model,col='red')

fitted(car_model)

##         1         2         3         4         5         6         7         8 
## -1.849460 -1.849460  9.947766  9.947766 13.880175 17.812584 21.744993 21.744993 
##         9        10        11        12        13        14        15        16 
## 21.744993 25.677401 25.677401 29.609810 29.609810 29.609810 29.609810 33.542219 
##        17        18        19        20        21        22        23        24 
## 33.542219 33.542219 33.542219 37.474628 37.474628 37.474628 37.474628 41.407036 
##        25        26        27        28        29        30        31        32 
## 41.407036 41.407036 45.339445 45.339445 49.271854 49.271854 49.271854 53.204263 
##        33        34        35        36        37        38        39        40 
## 53.204263 53.204263 53.204263 57.136672 57.136672 57.136672 61.069080 61.069080 
##        41        42        43        44        45        46        47        48 
## 61.069080 61.069080 61.069080 68.933898 72.866307 76.798715 76.798715 76.798715 
##        49        50 
## 76.798715 80.731124

residuals(car_model)

##          1          2          3          4          5          6          7 
##   3.849460  11.849460  -5.947766  12.052234   2.119825  -7.812584  -3.744993 
##          8          9         10         11         12         13         14 
##   4.255007  12.255007  -8.677401   2.322599 -15.609810  -9.609810  -5.609810 
##         15         16         17         18         19         20         21 
##  -1.609810  -7.542219   0.457781   0.457781  12.457781 -11.474628  -1.474628 
##         22         23         24         25         26         27         28 
##  22.525372  42.525372 -21.407036 -15.407036  12.592964 -13.339445  -5.339445 
##         29         30         31         32         33         34         35 
## -17.271854  -9.271854   0.728146 -11.204263   2.795737  22.795737  30.795737 
##         36         37         38         39         40         41         42 
## -21.136672 -11.136672  10.863328 -29.069080 -13.069080  -9.069080  -5.069080 
##         43         44         45         46         47         48         49 
##   2.930920  -2.933898 -18.866307  -6.798715  15.201285  16.201285  43.201285 
##         50 
##   4.268876

# Predict the stopping distance for a single new speed value.
nx1 <- data.frame(speed = 21.5)
predict(car_model, newdata = nx1)

##        1 
## 66.96769

# Predict for several new speeds and plot the predictions with the fitted
# line. Most of these speeds lie above the largest observed speed (25), so
# the plotted values are extrapolations of the model.
# The plot() arguments are kept as inline expressions because base plot()
# builds its default axis labels from them.
nx <- data.frame(
  speed = c(21.5, 25.0, 25.5, 26.0, 26.5, 27.0, 28.0)
)
plot(nx$speed, predict(car_model, nx), col = "red", cex = 2, pch = 20)
abline(car_model)



# Derive a categorical flag from speed: rows faster than the sample mean
# (15.4 for `cars`) are labelled "fast", all others "slow".
# `speed` is referenced by bare name so the condition is evaluated on the data
# flowing through the pipe rather than on the global `cars` object; the
# threshold is computed from the data instead of being hand-copied from
# summary().
cars1 <- cars %>%
  mutate(violation = ifelse(speed > mean(speed), "fast", "slow"))
cars1

##    speed dist violation
## 1      4    2      slow
## 2      4   10      slow
## 3      7    4      slow
## 4      7   22      slow
## 5      8   16      slow
## 6      9   10      slow
## 7     10   18      slow
## 8     10   26      slow
## 9     10   34      slow
## 10    11   17      slow
## 11    11   28      slow
## 12    12   14      slow
## 13    12   20      slow
## 14    12   24      slow
## 15    12   28      slow
## 16    13   26      slow
## 17    13   34      slow
## 18    13   34      slow
## 19    13   46      slow
## 20    14   26      slow
## 21    14   36      slow
## 22    14   60      slow
## 23    14   80      slow
## 24    15   20      slow
## 25    15   26      slow
## 26    15   54      slow
## 27    16   32      fast
## 28    16   40      fast
## 29    17   32      fast
## 30    17   40      fast
## 31    17   50      fast
## 32    18   42      fast
## 33    18   56      fast
## 34    18   76      fast
## 35    18   84      fast
## 36    19   36      fast
## 37    19   46      fast
## 38    19   68      fast
## 39    20   32      fast
## 40    20   48      fast
## 41    20   52      fast
## 42    20   56      fast
## 43    20   64      fast
## 44    22   66      fast
## 45    23   54      fast
## 46    24   70      fast
## 47    24   92      fast
## 48    24   93      fast
## 49    24  120      fast
## 50    25   85      fast

glimpse(cars1)

## Rows: 50
## Columns: 3
## $ speed     <dbl> 4, 4, 7, 7, 8, 9, 10, 10, 10, 11, 11, 12, 12, 12, 12, 13, 13…
## $ dist      <dbl> 2, 10, 4, 22, 16, 10, 18, 26, 34, 17, 28, 14, 20, 24, 28, 26…
## $ violation <chr> "slow", "slow", "slow", "slow", "slow", "slow", "slow", "slo…

# Convert the character flag to a factor and confirm the new column type.
cars1 <- cars1 %>%
  mutate(violation = factor(violation))
glimpse(cars1)

## Rows: 50
## Columns: 3
## $ speed     <dbl> 4, 4, 7, 7, 8, 9, 10, 10, 10, 11, 11, 12, 12, 12, 12, 13, 13…
## $ dist      <dbl> 2, 10, 4, 22, 16, 10, 18, 26, 34, 17, 28, 14, 20, 24, 28, 26…
## $ violation <fct> slow, slow, slow, slow, slow, slow, slow, slow, slow, slow, …

cars1 %>% group_by(violation) %>% summarize(p=mean(speed))

## # A tibble: 2 × 2
##   violation     p
##   <fct>     <dbl>
## 1 fast       19.9
## 2 slow       11.2

# hflights
library(hflights)
summary(hflights)

##       Year          Month          DayofMonth      DayOfWeek        DepTime    
##  Min.   :2011   Min.   : 1.000   Min.   : 1.00   Min.   :1.000   Min.   :   1  
##  1st Qu.:2011   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.:2.000   1st Qu.:1021  
##  Median :2011   Median : 7.000   Median :16.00   Median :4.000   Median :1416  
##  Mean   :2011   Mean   : 6.514   Mean   :15.74   Mean   :3.948   Mean   :1396  
##  3rd Qu.:2011   3rd Qu.: 9.000   3rd Qu.:23.00   3rd Qu.:6.000   3rd Qu.:1801  
##  Max.   :2011   Max.   :12.000   Max.   :31.00   Max.   :7.000   Max.   :2400  
##                                                                  NA's   :2905  
##     ArrTime     UniqueCarrier        FlightNum      TailNum         
##  Min.   :   1   Length:227496      Min.   :   1   Length:227496     
##  1st Qu.:1215   Class :character   1st Qu.: 855   Class :character  
##  Median :1617   Mode  :character   Median :1696   Mode  :character  
##  Mean   :1578                      Mean   :1962                     
##  3rd Qu.:1953                      3rd Qu.:2755                     
##  Max.   :2400                      Max.   :7290                     
##  NA's   :3066                                                       
##  ActualElapsedTime    AirTime         ArrDelay          DepDelay      
##  Min.   : 34.0     Min.   : 11.0   Min.   :-70.000   Min.   :-33.000  
##  1st Qu.: 77.0     1st Qu.: 58.0   1st Qu.: -8.000   1st Qu.: -3.000  
##  Median :128.0     Median :107.0   Median :  0.000   Median :  0.000  
##  Mean   :129.3     Mean   :108.1   Mean   :  7.094   Mean   :  9.445  
##  3rd Qu.:165.0     3rd Qu.:141.0   3rd Qu.: 11.000   3rd Qu.:  9.000  
##  Max.   :575.0     Max.   :549.0   Max.   :978.000   Max.   :981.000  
##  NA's   :3622      NA's   :3622    NA's   :3622      NA's   :2905     
##     Origin              Dest              Distance          TaxiIn       
##  Length:227496      Length:227496      Min.   :  79.0   Min.   :  1.000  
##  Class :character   Class :character   1st Qu.: 376.0   1st Qu.:  4.000  
##  Mode  :character   Mode  :character   Median : 809.0   Median :  5.000  
##                                        Mean   : 787.8   Mean   :  6.099  
##                                        3rd Qu.:1042.0   3rd Qu.:  7.000  
##                                        Max.   :3904.0   Max.   :165.000  
##                                                         NA's   :3066     
##     TaxiOut         Cancelled       CancellationCode      Diverted       
##  Min.   :  1.00   Min.   :0.00000   Length:227496      Min.   :0.000000  
##  1st Qu.: 10.00   1st Qu.:0.00000   Class :character   1st Qu.:0.000000  
##  Median : 14.00   Median :0.00000   Mode  :character   Median :0.000000  
##  Mean   : 15.09   Mean   :0.01307                      Mean   :0.002853  
##  3rd Qu.: 18.00   3rd Qu.:0.00000                      3rd Qu.:0.000000  
##  Max.   :163.00   Max.   :1.00000                      Max.   :1.000000  
##  NA's   :2947

glimpse(hflights)

## Rows: 227,496
## Columns: 21
## $ Year              <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011…
## $ Month             <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ DayofMonth        <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ DayOfWeek         <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2…
## $ DepTime           <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 1355, 1443…
## $ ArrTime           <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 1454, 1554…
## $ UniqueCarrier     <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA"…
## $ FlightNum         <int> 428, 428, 428, 428, 428, 428, 428, 428, 428, 428, 42…
## $ TailNum           <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N492AA", "N…
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, 56, 63, …
## $ AirTime           <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, 41, 44, …
## $ ArrDelay          <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29, 5, -9, …
## $ DepDelay          <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, -2, -3, …
## $ Origin            <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "IA…
## $ Dest              <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "DF…
## $ Distance          <int> 224, 224, 224, 224, 224, 224, 224, 224, 224, 224, 22…
## $ TaxiIn            <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6, 12, 8,…
## $ TaxiOut           <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11, 13, 15…
## $ Cancelled         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CancellationCode  <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", …
## $ Diverted          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

# Month 빈도분석 
hflights %>% count(Month)

##    Month     n
## 1      1 18910
## 2      2 17128
## 3      3 19470
## 4      4 18593
## 5      5 19172
## 6      6 19600
## 7      7 20548
## 8      8 20176
## 9      9 18065
## 10    10 18696
## 11    11 18021
## 12    12 19117

# Keep only the February and August flights and report the dimensions
hflights %>%
  filter(Month %in% c(2, 8)) %>%
  dim()

## [1] 37304    21

# Dest 도착지점(공항명)
glimpse(hflights)

## Rows: 227,496
## Columns: 21
## $ Year              <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011…
## $ Month             <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ DayofMonth        <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ DayOfWeek         <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2…
## $ DepTime           <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 1355, 1443…
## $ ArrTime           <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 1454, 1554…
## $ UniqueCarrier     <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA"…
## $ FlightNum         <int> 428, 428, 428, 428, 428, 428, 428, 428, 428, 428, 42…
## $ TailNum           <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N492AA", "N…
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, 56, 63, …
## $ AirTime           <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, 41, 44, …
## $ ArrDelay          <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29, 5, -9, …
## $ DepDelay          <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, -2, -3, …
## $ Origin            <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "IA…
## $ Dest              <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "DF…
## $ Distance          <int> 224, 224, 224, 224, 224, 224, 224, 224, 224, 224, 22…
## $ TaxiIn            <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6, 12, 8,…
## $ TaxiOut           <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11, 13, 15…
## $ Cancelled         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CancellationCode  <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", …
## $ Diverted          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

hflights %>% count(Dest)

##     Dest    n
## 1    ABQ 2812
## 2    AEX  724
## 3    AGS    1
## 4    AMA 1297
## 5    ANC  125
## 6    ASE  125
## 7    ATL 7886
## 8    AUS 5022
## 9    AVL  350
## 10   BFL  504
## 11   BHM 2736
## 12   BKG  110
## 13   BNA 3481
## 14   BOS 1752
## 15   BPT    3
## 16   BRO 1692
## 17   BTR 1762
## 18   BWI 2551
## 19   CAE  561
## 20   CHS 1200
## 21   CID  410
## 22   CLE 2140
## 23   CLT 4735
## 24   CMH 1348
## 25   COS 1657
## 26   CRP 4813
## 27   CRW  357
## 28   CVG 1535
## 29   DAL 9820
## 30   DAY  451
## 31   DCA 2699
## 32   DEN 5920
## 33   DFW 6653
## 34   DSM  647
## 35   DTW 2601
## 36   ECP  729
## 37   EGE  110
## 38   ELP 3036
## 39   EWR 4314
## 40   FLL 2462
## 41   GJT  403
## 42   GPT 1618
## 43   GRK   42
## 44   GRR  677
## 45   GSO  630
## 46   GSP 1123
## 47   GUC   86
## 48   HDN  110
## 49   HNL  402
## 50   HOB  309
## 51   HRL 3983
## 52   HSV  923
## 53   IAD 1980
## 54   ICT 1517
## 55   IND 1750
## 56   JAN 2011
## 57   JAX 2135
## 58   JFK  695
## 59   LAS 4082
## 60   LAX 6064
## 61   LBB 1333
## 62   LCH  364
## 63   LEX  584
## 64   LFT 2313
## 65   LGA 2730
## 66   LIT 1579
## 67   LRD 1188
## 68   MAF 2306
## 69   MCI 3174
## 70   MCO 3687
## 71   MDW 2094
## 72   MEM 2399
## 73   MFE 1128
## 74   MIA 2463
## 75   MKE 1588
## 76   MLU  292
## 77   MOB 1674
## 78   MSP 2010
## 79   MSY 6823
## 80   MTJ  164
## 81   OAK  690
## 82   OKC 3170
## 83   OMA 2044
## 84   ONT  952
## 85   ORD 5748
## 86   ORF  717
## 87   PBI 1253
## 88   PDX 1235
## 89   PHL 2367
## 90   PHX 5096
## 91   PIT 1664
## 92   PNS 1539
## 93   PSP  106
## 94   RDU 1740
## 95   RIC  900
## 96   RNO  243
## 97   RSW  948
## 98   SAN 2936
## 99   SAT 4893
## 100  SAV  863
## 101  SDF 1279
## 102  SEA 2615
## 103  SFO 2818
## 104  SHV  787
## 105  SJC  885
## 106  SJU  391
## 107  SLC 2033
## 108  SMF 1014
## 109  SNA 1661
## 110  STL 2509
## 111  TPA 3085
## 112  TUL 2924
## 113  TUS 1565
## 114  TYS 1210
## 115  VPS  880
## 116  XNA 1172

# 최댓값, 최솟값 찾기
hflights %>% count(Dest) %>% arrange(n)

##     Dest    n
## 1    AGS    1
## 2    BPT    3
## 3    GRK   42
## 4    GUC   86
## 5    PSP  106
## 6    BKG  110
## 7    EGE  110
## 8    HDN  110
## 9    ANC  125
## 10   ASE  125
## 11   MTJ  164
## 12   RNO  243
## 13   MLU  292
## 14   HOB  309
## 15   AVL  350
## 16   CRW  357
## 17   LCH  364
## 18   SJU  391
## 19   HNL  402
## 20   GJT  403
## 21   CID  410
## 22   DAY  451
## 23   BFL  504
## 24   CAE  561
## 25   LEX  584
## 26   GSO  630
## 27   DSM  647
## 28   GRR  677
## 29   OAK  690
## 30   JFK  695
## 31   ORF  717
## 32   AEX  724
## 33   ECP  729
## 34   SHV  787
## 35   SAV  863
## 36   VPS  880
## 37   SJC  885
## 38   RIC  900
## 39   HSV  923
## 40   RSW  948
## 41   ONT  952
## 42   SMF 1014
## 43   GSP 1123
## 44   MFE 1128
## 45   XNA 1172
## 46   LRD 1188
## 47   CHS 1200
## 48   TYS 1210
## 49   PDX 1235
## 50   PBI 1253
## 51   SDF 1279
## 52   AMA 1297
## 53   LBB 1333
## 54   CMH 1348
## 55   ICT 1517
## 56   CVG 1535
## 57   PNS 1539
## 58   TUS 1565
## 59   LIT 1579
## 60   MKE 1588
## 61   GPT 1618
## 62   COS 1657
## 63   SNA 1661
## 64   PIT 1664
## 65   MOB 1674
## 66   BRO 1692
## 67   RDU 1740
## 68   IND 1750
## 69   BOS 1752
## 70   BTR 1762
## 71   IAD 1980
## 72   MSP 2010
## 73   JAN 2011
## 74   SLC 2033
## 75   OMA 2044
## 76   MDW 2094
## 77   JAX 2135
## 78   CLE 2140
## 79   MAF 2306
## 80   LFT 2313
## 81   PHL 2367
## 82   MEM 2399
## 83   FLL 2462
## 84   MIA 2463
## 85   STL 2509
## 86   BWI 2551
## 87   DTW 2601
## 88   SEA 2615
## 89   DCA 2699
## 90   LGA 2730
## 91   BHM 2736
## 92   ABQ 2812
## 93   SFO 2818
## 94   TUL 2924
## 95   SAN 2936
## 96   ELP 3036
## 97   TPA 3085
## 98   OKC 3170
## 99   MCI 3174
## 100  BNA 3481
## 101  MCO 3687
## 102  HRL 3983
## 103  LAS 4082
## 104  EWR 4314
## 105  CLT 4735
## 106  CRP 4813
## 107  SAT 4893
## 108  AUS 5022
## 109  PHX 5096
## 110  ORD 5748
## 111  DEN 5920
## 112  LAX 6064
## 113  DFW 6653
## 114  MSY 6823
## 115  ATL 7886
## 116  DAL 9820

hflights %>% count(Dest) %>% arrange(desc(n))

##     Dest    n
## 1    DAL 9820
## 2    ATL 7886
## 3    MSY 6823
## 4    DFW 6653
## 5    LAX 6064
## 6    DEN 5920
## 7    ORD 5748
## 8    PHX 5096
## 9    AUS 5022
## 10   SAT 4893
## 11   CRP 4813
## 12   CLT 4735
## 13   EWR 4314
## 14   LAS 4082
## 15   HRL 3983
## 16   MCO 3687
## 17   BNA 3481
## 18   MCI 3174
## 19   OKC 3170
## 20   TPA 3085
## 21   ELP 3036
## 22   SAN 2936
## 23   TUL 2924
## 24   SFO 2818
## 25   ABQ 2812
## 26   BHM 2736
## 27   LGA 2730
## 28   DCA 2699
## 29   SEA 2615
## 30   DTW 2601
## 31   BWI 2551
## 32   STL 2509
## 33   MIA 2463
## 34   FLL 2462
## 35   MEM 2399
## 36   PHL 2367
## 37   LFT 2313
## 38   MAF 2306
## 39   CLE 2140
## 40   JAX 2135
## 41   MDW 2094
## 42   OMA 2044
## 43   SLC 2033
## 44   JAN 2011
## 45   MSP 2010
## 46   IAD 1980
## 47   BTR 1762
## 48   BOS 1752
## 49   IND 1750
## 50   RDU 1740
## 51   BRO 1692
## 52   MOB 1674
## 53   PIT 1664
## 54   SNA 1661
## 55   COS 1657
## 56   GPT 1618
## 57   MKE 1588
## 58   LIT 1579
## 59   TUS 1565
## 60   PNS 1539
## 61   CVG 1535
## 62   ICT 1517
## 63   CMH 1348
## 64   LBB 1333
## 65   AMA 1297
## 66   SDF 1279
## 67   PBI 1253
## 68   PDX 1235
## 69   TYS 1210
## 70   CHS 1200
## 71   LRD 1188
## 72   XNA 1172
## 73   MFE 1128
## 74   GSP 1123
## 75   SMF 1014
## 76   ONT  952
## 77   RSW  948
## 78   HSV  923
## 79   RIC  900
## 80   SJC  885
## 81   VPS  880
## 82   SAV  863
## 83   SHV  787
## 84   ECP  729
## 85   AEX  724
## 86   ORF  717
## 87   JFK  695
## 88   OAK  690
## 89   GRR  677
## 90   DSM  647
## 91   GSO  630
## 92   LEX  584
## 93   CAE  561
## 94   BFL  504
## 95   DAY  451
## 96   CID  410
## 97   GJT  403
## 98   HNL  402
## 99   SJU  391
## 100  LCH  364
## 101  CRW  357
## 102  AVL  350
## 103  HOB  309
## 104  MLU  292
## 105  RNO  243
## 106  MTJ  164
## 107  ANC  125
## 108  ASE  125
## 109  BKG  110
## 110  EGE  110
## 111  HDN  110
## 112  PSP  106
## 113  GUC   86
## 114  GRK   42
## 115  BPT    3
## 116  AGS    1

# gapminder data
library(gapminder)
glimpse(gapminder)

## Rows: 1,704
## Columns: 6
## $ country   <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
## $ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
## $ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
## $ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
## $ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …

gapminder %>% count(country)

## # A tibble: 142 × 2
##    country         n
##    <fct>       <int>
##  1 Afghanistan    12
##  2 Albania        12
##  3 Algeria        12
##  4 Angola         12
##  5 Argentina      12
##  6 Australia      12
##  7 Austria        12
##  8 Bahrain        12
##  9 Bangladesh     12
## 10 Belgium        12
## # ℹ 132 more rows

gapminder %>% count(continent)

## # A tibble: 5 × 2
##   continent     n
##   <fct>     <int>
## 1 Africa      624
## 2 Americas    300
## 3 Asia        396
## 4 Europe      360
## 5 Oceania      24

# 대륙별 평균 기대 수명
gapminder %>% group_by(continent) %>% summarize(m=mean(lifeExp))

## # A tibble: 5 × 2
##   continent     m
##   <fct>     <dbl>
## 1 Africa     48.9
## 2 Americas   64.7
## 3 Asia       60.1
## 4 Europe     71.9
## 5 Oceania    74.3

# Ascending order first, then descending order.
# arrange() needs the sort column: called with no arguments it returns the
# rows unchanged, so the mean life expectancy `m` is named explicitly.
gapminder %>% group_by(continent) %>% summarize(m=mean(lifeExp)) %>% arrange(m)

## # A tibble: 5 × 2
##   continent     m
##   <fct>     <dbl>
## 1 Africa     48.9
## 2 Asia       60.1
## 3 Americas   64.7
## 4 Europe     71.9
## 5 Oceania    74.3

gapminder %>% group_by(continent) %>% summarize(m=mean(lifeExp)) %>% arrange(desc(m))

## # A tibble: 5 × 2
##   continent     m
##   <fct>     <dbl>
## 1 Oceania    74.3
## 2 Europe     71.9
## 3 Americas   64.7
## 4 Asia       60.1
## 5 Africa     48.9

gapminder %>% group_by(continent,country) %>% summarize(m1=mean(pop))

## `summarise()` has grouped output by 'continent'. You can override using the
## `.groups` argument.

## # A tibble: 142 × 3
## # Groups:   continent [5]
##    continent country                         m1
##    <fct>     <fct>                        <dbl>
##  1 Africa    Algeria                  19875406.
##  2 Africa    Angola                    7309390.
##  3 Africa    Benin                     4017497.
##  4 Africa    Botswana                   971186.
##  5 Africa    Burkina Faso              7548677.
##  6 Africa    Burundi                   4651608.
##  7 Africa    Cameroon                  9816648.
##  8 Africa    Central African Republic  2560963 
##  9 Africa    Chad                      5329256.
## 10 Africa    Comoros                    361684.
## # ℹ 132 more rows

gapminder %>% group_by(continent,country) %>% summarize(m1=mean(pop)) %>% arrange(desc(m1))

## `summarise()` has grouped output by 'continent'. You can override using the
## `.groups` argument.

## # A tibble: 142 × 3
## # Groups:   continent [5]
##    continent country               m1
##    <fct>     <fct>              <dbl>
##  1 Asia      China         958160052.
##  2 Asia      India         701130740.
##  3 Americas  United States 228211232.
##  4 Asia      Indonesia     148322833.
##  5 Americas  Brazil        122312127.
##  6 Asia      Japan         111758808 
##  7 Asia      Pakistan       93683386.
##  8 Asia      Bangladesh     90755395.
##  9 Europe    Germany        77547043.
## 10 Africa    Nigeria        73708018.
## # ℹ 132 more rows

tips<-read.csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv')
glimpse(tips)

## Rows: 244
## Columns: 7
## $ total_bill <dbl> 16.99, 10.34, 21.01, 23.68, 24.59, 25.29, 8.77, 26.88, 15.0…
## $ tip        <dbl> 1.01, 1.66, 3.50, 3.31, 3.61, 4.71, 2.00, 3.12, 1.96, 3.23,…
## $ sex        <chr> "Female", "Male", "Male", "Male", "Female", "Male", "Male",…
## $ smoker     <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",…
## $ day        <chr> "Sun", "Sun", "Sun", "Sun", "Sun", "Sun", "Sun", "Sun", "Su…
## $ time       <chr> "Dinner", "Dinner", "Dinner", "Dinner", "Dinner", "Dinner",…
## $ size       <int> 2, 3, 3, 2, 4, 4, 2, 4, 2, 2, 2, 4, 2, 4, 2, 2, 3, 3, 3, 3,…

# 계산서 금액 : total_bill
# 요일: day
# 팁 액수(tip)
# 시간:time
# 계산한 사람의 성별(sex)
# 동석자 수(size)
# 흡연한 사람의 여부(smoker)

summary(tips)

##    total_bill         tip             sex               smoker         
##  Min.   : 3.07   Min.   : 1.000   Length:244         Length:244        
##  1st Qu.:13.35   1st Qu.: 2.000   Class :character   Class :character  
##  Median :17.80   Median : 2.900   Mode  :character   Mode  :character  
##  Mean   :19.79   Mean   : 2.998                                        
##  3rd Qu.:24.13   3rd Qu.: 3.562                                        
##  Max.   :50.81   Max.   :10.000                                        
##      day                time                size     
##  Length:244         Length:244         Min.   :1.00  
##  Class :character   Class :character   1st Qu.:2.00  
##  Mode  :character   Mode  :character   Median :2.00  
##                                        Mean   :2.57  
##                                        3rd Qu.:3.00  
##                                        Max.   :6.00

library(dplyr)
library(ggplot2)

# Party size is a small integer count (1 to 6 in this data), so use one bin
# per value instead of the default 30 bins, which leaves gaps between bars and
# triggers the stat_bin() "Pick better value" message.
tips %>%
  ggplot(aes(size)) +
  geom_histogram(binwidth = 1)

# Tip against total bill, drawn three ways: plain, coloured by day, and
# coloured by day with the marker shape keyed to sex.
bill_tip <- ggplot(tips, aes(x = total_bill, y = tip))

bill_tip + geom_point()

bill_tip + geom_point(aes(colour = day))

bill_tip + geom_point(aes(colour = day, shape = sex), size = 3)

# PT 교안 106
y<-gapminder %>% group_by(year,continent) %>% summarize(c_pop=sum(pop))

## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.

head(y,10)

## # A tibble: 10 × 3
## # Groups:   year [2]
##     year continent      c_pop
##    <int> <fct>          <dbl>
##  1  1952 Africa     237640501
##  2  1952 Americas   345152446
##  3  1952 Asia      1395357351
##  4  1952 Europe     418120846
##  5  1952 Oceania     10686006
##  6  1957 Africa     264837738
##  7  1957 Americas   386953916
##  8  1957 Asia      1562780599
##  9  1957 Europe     437890351
## 10  1957 Oceania     11941976

plot(y$year,y$c_pop)

# Life expectancy against GDP per capita, coloured by continent.
# plot() draws every point with the default symbol (pch = 1) and takes its
# colours from the integer codes of the continent factor, so the legend has to
# show that same single symbol with colour indices 1..k in level order.
# The levels are read from gapminder itself rather than from the unrelated
# summary table `y`.
continents <- levels(gapminder$continent)

plot(gapminder$gdpPercap,gapminder$lifeExp,col=gapminder$continent)
legend("bottomright", legend = continents,
       pch = 1, col = seq_along(continents))

# Same plot with GDP per capita on a log10 scale.
plot(log10(gapminder$gdpPercap),gapminder$lifeExp,col=gapminder$continent)
legend("bottomright", legend = continents,
       pch = 1, col = seq_along(continents))

# ggplot
library(ggplot2)
# GDP per capita (log10 x axis) against life expectancy, one colour per
# continent.
ggplot(gapminder, aes(gdpPercap, lifeExp, colour = continent)) +
  geom_point() +
  scale_x_log10()

# Shared base for the bubble charts: point size is mapped to population.
bubble_base <- ggplot(
  gapminder,
  aes(gdpPercap, lifeExp, colour = continent, size = pop)
) +
  scale_x_log10()

# Opaque points, then semi-transparent points, then one panel per year.
bubble_base + geom_point()

bubble_base + geom_point(alpha = 0.5)

bubble_base + geom_point(alpha = 0.5) + facet_wrap(~year)

# Horizontal bar chart of 1952 population for Asian countries, with the bars
# ordered by population.
gapminder %>%
  filter(year == 1952, continent == "Asia") %>%
  ggplot(aes(reorder(country, pop), pop)) +
  geom_col() +
  coord_flip()

gapminder %>% filter(country=='Korea, Rep.')

## # A tibble: 12 × 6
##    country     continent  year lifeExp      pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Korea, Rep. Asia       1952    47.5 20947571     1031.
##  2 Korea, Rep. Asia       1957    52.7 22611552     1488.
##  3 Korea, Rep. Asia       1962    55.3 26420307     1536.
##  4 Korea, Rep. Asia       1967    57.7 30131000     2029.
##  5 Korea, Rep. Asia       1972    62.6 33505000     3031.
##  6 Korea, Rep. Asia       1977    64.8 36436000     4657.
##  7 Korea, Rep. Asia       1982    67.1 39326000     5623.
##  8 Korea, Rep. Asia       1987    69.8 41622000     8533.
##  9 Korea, Rep. Asia       1992    72.2 43805450    12104.
## 10 Korea, Rep. Asia       1997    74.6 46173816    15994.
## 11 Korea, Rep. Asia       2002    77.0 47969150    19234.
## 12 Korea, Rep. Asia       2007    78.6 49044790    23348.

# Life expectancy over time for South Korea: first as points joined by a
# line, then as faint points with a smoothed trend.
korea <- gapminder %>%
  filter(country == "Korea, Rep.")

ggplot(korea, aes(year, lifeExp, colour = country)) +
  geom_point() +
  geom_line()

ggplot(korea, aes(year, lifeExp, colour = country)) +
  geom_point(alpha = 0.2) +
  geom_smooth()

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

gapminder%>%ggplot(aes(x=year,y=lifeExp,col=continent))+geom_point(alpha=0.2)+geom_smooth()

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

x<-gapminder %>% filter(year==1952)
hist(x$lifeExp,main="Histogram of lifeExp in 1952")

x %>% ggplot(aes(continent,lifeExp))+geom_boxplot()

ucla<-read.csv('https://stats.idre.ucla.edu/stat/data/binary.csv')
library(dplyr)
glimpse(ucla)

## Rows: 400
## Columns: 4
## $ admit <int> 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1…
## $ gre   <int> 380, 660, 800, 640, 520, 760, 560, 400, 540, 700, 800, 440, 760,…
## $ gpa   <dbl> 3.61, 3.67, 4.00, 3.19, 2.93, 3.00, 2.98, 3.08, 3.39, 3.92, 4.00…
## $ rank  <int> 3, 3, 1, 4, 4, 2, 1, 2, 3, 2, 4, 1, 1, 2, 1, 3, 4, 3, 2, 1, 3, 2…

# Treat the 0/1 admission outcome as a factor, then fit a logistic regression
# of admit on every other column (gre, gpa, rank) and print the fit summary.
# NOTE(review): rank is left as an integer, so the model estimates a single
# linear slope for it; confirm that is intended rather than a categorical rank.
ucla$admit<-as.factor(ucla$admit)
m<-glm(admit~.,data=ucla,family="binomial")
summary(m)

## 
## Call:
## glm(formula = admit ~ ., family = "binomial", data = ucla)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.449548   1.132846  -3.045  0.00233 ** 
## gre          0.002294   0.001092   2.101  0.03564 *  
## gpa          0.777014   0.327484   2.373  0.01766 *  
## rank        -0.560031   0.127137  -4.405 1.06e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 499.98  on 399  degrees of freedom
## Residual deviance: 459.44  on 396  degrees of freedom
## AIC: 467.44
## 
## Number of Fisher Scoring iterations: 4

# Odds ratio for gpa: exponentiate its estimate as printed by summary(m).
# NOTE(review): the value is hand-copied, so it goes stale if the model is
# refit; exp(coef(m)) would track the fit.
exp(0.777014)

## [1] 2.174968

# Odds ratio for rank, from its printed estimate (also hand-copied).
exp(-0.560031)

## [1] 0.5711914

# Predicted admission probability for one hypothetical applicant.
s <- data.frame(gre = 376, gpa = 3.6, rank = 3)
predict(m, newdata = s, type = "response")

##         1 
## 0.1869631

도로교통공단 2일차

김도현

2024-01-29