# 파생변수 생성
# Switch to the data directory only when it exists, so the script does not
# abort on machines that lack this Windows-specific path.
data_dir <- "C:/Data"
if (dir.exists(data_dir)) {
  setwd(data_dir)
}
getwd()
## [1] "C:/Data"
library(dplyr)
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the built-in cars data set and inspect its structure.
data("cars")
cars %>% glimpse()
## Rows: 50
## Columns: 2
## $ speed <dbl> 4, 4, 7, 7, 8, 9, 10, 10, 10, 11, 11, 12, 12, 12, 12, 13, 13, 13…
## $ dist <dbl> 2, 10, 4, 22, 16, 10, 18, 26, 34, 17, 28, 14, 20, 24, 28, 26, 34…
# Per-column summary statistics; the recorded output lists no NA counts,
# i.e. there are no missing values.
cars %>% summary()
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
# Scatter plot of the cars data, then a simple linear regression of dist on
# speed whose fitted line is drawn over the scatter in red.
plot(cars)
car_model <- lm(formula = dist ~ speed, data = cars)
coef(car_model)
## (Intercept) speed
## -17.579095 3.932409
abline(car_model, col = "red")

# Fitted dist value for each of the 50 observations
car_model %>% fitted()
## 1 2 3 4 5 6 7 8
## -1.849460 -1.849460 9.947766 9.947766 13.880175 17.812584 21.744993 21.744993
## 9 10 11 12 13 14 15 16
## 21.744993 25.677401 25.677401 29.609810 29.609810 29.609810 29.609810 33.542219
## 17 18 19 20 21 22 23 24
## 33.542219 33.542219 33.542219 37.474628 37.474628 37.474628 37.474628 41.407036
## 25 26 27 28 29 30 31 32
## 41.407036 41.407036 45.339445 45.339445 49.271854 49.271854 49.271854 53.204263
## 33 34 35 36 37 38 39 40
## 53.204263 53.204263 53.204263 57.136672 57.136672 57.136672 61.069080 61.069080
## 41 42 43 44 45 46 47 48
## 61.069080 61.069080 61.069080 68.933898 72.866307 76.798715 76.798715 76.798715
## 49 50
## 76.798715 80.731124
# Residual (observed minus fitted dist) for each observation
car_model %>% residuals()
## 1 2 3 4 5 6 7
## 3.849460 11.849460 -5.947766 12.052234 2.119825 -7.812584 -3.744993
## 8 9 10 11 12 13 14
## 4.255007 12.255007 -8.677401 2.322599 -15.609810 -9.609810 -5.609810
## 15 16 17 18 19 20 21
## -1.609810 -7.542219 0.457781 0.457781 12.457781 -11.474628 -1.474628
## 22 23 24 25 26 27 28
## 22.525372 42.525372 -21.407036 -15.407036 12.592964 -13.339445 -5.339445
## 29 30 31 32 33 34 35
## -17.271854 -9.271854 0.728146 -11.204263 2.795737 22.795737 30.795737
## 36 37 38 39 40 41 42
## -21.136672 -11.136672 10.863328 -29.069080 -13.069080 -9.069080 -5.069080
## 43 44 45 46 47 48 49
## 2.930920 -2.933898 -18.866307 -6.798715 15.201285 16.201285 43.201285
## 50
## 4.268876
# Predict dist for a single new speed value of 21.5
nx1 <- data.frame(speed = 21.5)
predict(car_model, newdata = nx1)
## 1
## 66.96769
# Predict dist over a set of new speeds (several beyond the observed maximum
# of 25) and plot the predictions as large red filled points.
nx<-data.frame(speed=c(21.5,25.0,25.5,26.0,26.5,27.0,28.0))
plot(nx$speed,predict(car_model,nx),col='red',cex=2,pch=20)
# Overlay the fitted regression line on the prediction plot.
abline(car_model)
# Derive a two-level label from speed: "fast" when above the mean speed
# (15.4 for cars, per the summary output), "slow" otherwise.
# The column is referenced by its bare name so mutate() evaluates it against
# the piped data rather than the global `cars` object, and the threshold is
# computed instead of retyped.
cars1 <- cars %>%
  mutate(violation = ifelse(speed > mean(speed), "fast", "slow"))
cars1
## speed dist violation
## 1 4 2 slow
## 2 4 10 slow
## 3 7 4 slow
## 4 7 22 slow
## 5 8 16 slow
## 6 9 10 slow
## 7 10 18 slow
## 8 10 26 slow
## 9 10 34 slow
## 10 11 17 slow
## 11 11 28 slow
## 12 12 14 slow
## 13 12 20 slow
## 14 12 24 slow
## 15 12 28 slow
## 16 13 26 slow
## 17 13 34 slow
## 18 13 34 slow
## 19 13 46 slow
## 20 14 26 slow
## 21 14 36 slow
## 22 14 60 slow
## 23 14 80 slow
## 24 15 20 slow
## 25 15 26 slow
## 26 15 54 slow
## 27 16 32 fast
## 28 16 40 fast
## 29 17 32 fast
## 30 17 40 fast
## 31 17 50 fast
## 32 18 42 fast
## 33 18 56 fast
## 34 18 76 fast
## 35 18 84 fast
## 36 19 36 fast
## 37 19 46 fast
## 38 19 68 fast
## 39 20 32 fast
## 40 20 48 fast
## 41 20 52 fast
## 42 20 56 fast
## 43 20 64 fast
## 44 22 66 fast
## 45 23 54 fast
## 46 24 70 fast
## 47 24 92 fast
## 48 24 93 fast
## 49 24 120 fast
## 50 25 85 fast
# violation starts out as a character column.
cars1 %>% glimpse()
## Rows: 50
## Columns: 3
## $ speed <dbl> 4, 4, 7, 7, 8, 9, 10, 10, 10, 11, 11, 12, 12, 12, 12, 13, 13…
## $ dist <dbl> 2, 10, 4, 22, 16, 10, 18, 26, 34, 17, 28, 14, 20, 24, 28, 26…
## $ violation <chr> "slow", "slow", "slow", "slow", "slow", "slow", "slow", "slo…
# Convert the label to a factor and confirm the new column type.
cars1[["violation"]] <- factor(cars1[["violation"]])
cars1 %>% glimpse()
## Rows: 50
## Columns: 3
## $ speed <dbl> 4, 4, 7, 7, 8, 9, 10, 10, 10, 11, 11, 12, 12, 12, 12, 13, 13…
## $ dist <dbl> 2, 10, 4, 22, 16, 10, 18, 26, 34, 17, 28, 14, 20, 24, 28, 26…
## $ violation <fct> slow, slow, slow, slow, slow, slow, slow, slow, slow, slow, …
# Mean speed within each violation group
cars1 %>%
  group_by(violation) %>%
  summarise(p = mean(speed))
## # A tibble: 2 × 2
## violation p
## <fct> <dbl>
## 1 fast 19.9
## 2 slow 11.2
# hflights
library(hflights)
# Column-wise summary statistics (including NA counts) for hflights
hflights %>% summary()
## Year Month DayofMonth DayOfWeek DepTime
## Min. :2011 Min. : 1.000 Min. : 1.00 Min. :1.000 Min. : 1
## 1st Qu.:2011 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.:2.000 1st Qu.:1021
## Median :2011 Median : 7.000 Median :16.00 Median :4.000 Median :1416
## Mean :2011 Mean : 6.514 Mean :15.74 Mean :3.948 Mean :1396
## 3rd Qu.:2011 3rd Qu.: 9.000 3rd Qu.:23.00 3rd Qu.:6.000 3rd Qu.:1801
## Max. :2011 Max. :12.000 Max. :31.00 Max. :7.000 Max. :2400
## NA's :2905
## ArrTime UniqueCarrier FlightNum TailNum
## Min. : 1 Length:227496 Min. : 1 Length:227496
## 1st Qu.:1215 Class :character 1st Qu.: 855 Class :character
## Median :1617 Mode :character Median :1696 Mode :character
## Mean :1578 Mean :1962
## 3rd Qu.:1953 3rd Qu.:2755
## Max. :2400 Max. :7290
## NA's :3066
## ActualElapsedTime AirTime ArrDelay DepDelay
## Min. : 34.0 Min. : 11.0 Min. :-70.000 Min. :-33.000
## 1st Qu.: 77.0 1st Qu.: 58.0 1st Qu.: -8.000 1st Qu.: -3.000
## Median :128.0 Median :107.0 Median : 0.000 Median : 0.000
## Mean :129.3 Mean :108.1 Mean : 7.094 Mean : 9.445
## 3rd Qu.:165.0 3rd Qu.:141.0 3rd Qu.: 11.000 3rd Qu.: 9.000
## Max. :575.0 Max. :549.0 Max. :978.000 Max. :981.000
## NA's :3622 NA's :3622 NA's :3622 NA's :2905
## Origin Dest Distance TaxiIn
## Length:227496 Length:227496 Min. : 79.0 Min. : 1.000
## Class :character Class :character 1st Qu.: 376.0 1st Qu.: 4.000
## Mode :character Mode :character Median : 809.0 Median : 5.000
## Mean : 787.8 Mean : 6.099
## 3rd Qu.:1042.0 3rd Qu.: 7.000
## Max. :3904.0 Max. :165.000
## NA's :3066
## TaxiOut Cancelled CancellationCode Diverted
## Min. : 1.00 Min. :0.00000 Length:227496 Min. :0.000000
## 1st Qu.: 10.00 1st Qu.:0.00000 Class :character 1st Qu.:0.000000
## Median : 14.00 Median :0.00000 Mode :character Median :0.000000
## Mean : 15.09 Mean :0.01307 Mean :0.002853
## 3rd Qu.: 18.00 3rd Qu.:0.00000 3rd Qu.:0.000000
## Max. :163.00 Max. :1.00000 Max. :1.000000
## NA's :2947
# Column names, types and leading values of hflights
hflights %>% glimpse()
## Rows: 227,496
## Columns: 21
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011…
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ DayofMonth <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ DayOfWeek <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2…
## $ DepTime <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 1355, 1443…
## $ ArrTime <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 1454, 1554…
## $ UniqueCarrier <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA"…
## $ FlightNum <int> 428, 428, 428, 428, 428, 428, 428, 428, 428, 428, 42…
## $ TailNum <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N492AA", "N…
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, 56, 63, …
## $ AirTime <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, 41, 44, …
## $ ArrDelay <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29, 5, -9, …
## $ DepDelay <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, -2, -3, …
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "IA…
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "DF…
## $ Distance <int> 224, 224, 224, 224, 224, 224, 224, 224, 224, 224, 22…
## $ TaxiIn <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6, 12, 8,…
## $ TaxiOut <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11, 13, 15…
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CancellationCode <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", …
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
# Frequency table of Month
count(hflights, Month)
## Month n
## 1 1 18910
## 2 2 17128
## 3 3 19470
## 4 4 18593
## 5 5 19172
## 6 6 19600
## 7 7 20548
## 8 8 20176
## 9 9 18065
## 10 10 18696
## 11 11 18021
## 12 12 19117
# Dimensions of the subset where Month is 2 (February) or 8 (August)
hflights %>%
  filter(Month == 2 | Month == 8) %>%
  dim()
## [1] 37304 21
# Dest: destination (airport name)
hflights %>% glimpse()
## Rows: 227,496
## Columns: 21
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011…
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ DayofMonth <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ DayOfWeek <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2…
## $ DepTime <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 1355, 1443…
## $ ArrTime <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 1454, 1554…
## $ UniqueCarrier <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA"…
## $ FlightNum <int> 428, 428, 428, 428, 428, 428, 428, 428, 428, 428, 42…
## $ TailNum <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N492AA", "N…
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, 56, 63, …
## $ AirTime <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, 41, 44, …
## $ ArrDelay <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29, 5, -9, …
## $ DepDelay <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, -2, -3, …
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "IA…
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "DF…
## $ Distance <int> 224, 224, 224, 224, 224, 224, 224, 224, 224, 224, 22…
## $ TaxiIn <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6, 12, 8,…
## $ TaxiOut <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11, 13, 15…
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CancellationCode <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", …
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
# Row count per Dest value
count(hflights, Dest)
## Dest n
## 1 ABQ 2812
## 2 AEX 724
## 3 AGS 1
## 4 AMA 1297
## 5 ANC 125
## 6 ASE 125
## 7 ATL 7886
## 8 AUS 5022
## 9 AVL 350
## 10 BFL 504
## 11 BHM 2736
## 12 BKG 110
## 13 BNA 3481
## 14 BOS 1752
## 15 BPT 3
## 16 BRO 1692
## 17 BTR 1762
## 18 BWI 2551
## 19 CAE 561
## 20 CHS 1200
## 21 CID 410
## 22 CLE 2140
## 23 CLT 4735
## 24 CMH 1348
## 25 COS 1657
## 26 CRP 4813
## 27 CRW 357
## 28 CVG 1535
## 29 DAL 9820
## 30 DAY 451
## 31 DCA 2699
## 32 DEN 5920
## 33 DFW 6653
## 34 DSM 647
## 35 DTW 2601
## 36 ECP 729
## 37 EGE 110
## 38 ELP 3036
## 39 EWR 4314
## 40 FLL 2462
## 41 GJT 403
## 42 GPT 1618
## 43 GRK 42
## 44 GRR 677
## 45 GSO 630
## 46 GSP 1123
## 47 GUC 86
## 48 HDN 110
## 49 HNL 402
## 50 HOB 309
## 51 HRL 3983
## 52 HSV 923
## 53 IAD 1980
## 54 ICT 1517
## 55 IND 1750
## 56 JAN 2011
## 57 JAX 2135
## 58 JFK 695
## 59 LAS 4082
## 60 LAX 6064
## 61 LBB 1333
## 62 LCH 364
## 63 LEX 584
## 64 LFT 2313
## 65 LGA 2730
## 66 LIT 1579
## 67 LRD 1188
## 68 MAF 2306
## 69 MCI 3174
## 70 MCO 3687
## 71 MDW 2094
## 72 MEM 2399
## 73 MFE 1128
## 74 MIA 2463
## 75 MKE 1588
## 76 MLU 292
## 77 MOB 1674
## 78 MSP 2010
## 79 MSY 6823
## 80 MTJ 164
## 81 OAK 690
## 82 OKC 3170
## 83 OMA 2044
## 84 ONT 952
## 85 ORD 5748
## 86 ORF 717
## 87 PBI 1253
## 88 PDX 1235
## 89 PHL 2367
## 90 PHX 5096
## 91 PIT 1664
## 92 PNS 1539
## 93 PSP 106
## 94 RDU 1740
## 95 RIC 900
## 96 RNO 243
## 97 RSW 948
## 98 SAN 2936
## 99 SAT 4893
## 100 SAV 863
## 101 SDF 1279
## 102 SEA 2615
## 103 SFO 2818
## 104 SHV 787
## 105 SJC 885
## 106 SJU 391
## 107 SLC 2033
## 108 SMF 1014
## 109 SNA 1661
## 110 STL 2509
## 111 TPA 3085
## 112 TUL 2924
## 113 TUS 1565
## 114 TYS 1210
## 115 VPS 880
## 116 XNA 1172
# Find the minimum and maximum: Dest counts sorted in ascending order
count(hflights, Dest) %>%
  arrange(n)
## Dest n
## 1 AGS 1
## 2 BPT 3
## 3 GRK 42
## 4 GUC 86
## 5 PSP 106
## 6 BKG 110
## 7 EGE 110
## 8 HDN 110
## 9 ANC 125
## 10 ASE 125
## 11 MTJ 164
## 12 RNO 243
## 13 MLU 292
## 14 HOB 309
## 15 AVL 350
## 16 CRW 357
## 17 LCH 364
## 18 SJU 391
## 19 HNL 402
## 20 GJT 403
## 21 CID 410
## 22 DAY 451
## 23 BFL 504
## 24 CAE 561
## 25 LEX 584
## 26 GSO 630
## 27 DSM 647
## 28 GRR 677
## 29 OAK 690
## 30 JFK 695
## 31 ORF 717
## 32 AEX 724
## 33 ECP 729
## 34 SHV 787
## 35 SAV 863
## 36 VPS 880
## 37 SJC 885
## 38 RIC 900
## 39 HSV 923
## 40 RSW 948
## 41 ONT 952
## 42 SMF 1014
## 43 GSP 1123
## 44 MFE 1128
## 45 XNA 1172
## 46 LRD 1188
## 47 CHS 1200
## 48 TYS 1210
## 49 PDX 1235
## 50 PBI 1253
## 51 SDF 1279
## 52 AMA 1297
## 53 LBB 1333
## 54 CMH 1348
## 55 ICT 1517
## 56 CVG 1535
## 57 PNS 1539
## 58 TUS 1565
## 59 LIT 1579
## 60 MKE 1588
## 61 GPT 1618
## 62 COS 1657
## 63 SNA 1661
## 64 PIT 1664
## 65 MOB 1674
## 66 BRO 1692
## 67 RDU 1740
## 68 IND 1750
## 69 BOS 1752
## 70 BTR 1762
## 71 IAD 1980
## 72 MSP 2010
## 73 JAN 2011
## 74 SLC 2033
## 75 OMA 2044
## 76 MDW 2094
## 77 JAX 2135
## 78 CLE 2140
## 79 MAF 2306
## 80 LFT 2313
## 81 PHL 2367
## 82 MEM 2399
## 83 FLL 2462
## 84 MIA 2463
## 85 STL 2509
## 86 BWI 2551
## 87 DTW 2601
## 88 SEA 2615
## 89 DCA 2699
## 90 LGA 2730
## 91 BHM 2736
## 92 ABQ 2812
## 93 SFO 2818
## 94 TUL 2924
## 95 SAN 2936
## 96 ELP 3036
## 97 TPA 3085
## 98 OKC 3170
## 99 MCI 3174
## 100 BNA 3481
## 101 MCO 3687
## 102 HRL 3983
## 103 LAS 4082
## 104 EWR 4314
## 105 CLT 4735
## 106 CRP 4813
## 107 SAT 4893
## 108 AUS 5022
## 109 PHX 5096
## 110 ORD 5748
## 111 DEN 5920
## 112 LAX 6064
## 113 DFW 6653
## 114 MSY 6823
## 115 ATL 7886
## 116 DAL 9820
# Dest counts sorted in descending order (largest first)
count(hflights, Dest) %>%
  arrange(desc(n))
## Dest n
## 1 DAL 9820
## 2 ATL 7886
## 3 MSY 6823
## 4 DFW 6653
## 5 LAX 6064
## 6 DEN 5920
## 7 ORD 5748
## 8 PHX 5096
## 9 AUS 5022
## 10 SAT 4893
## 11 CRP 4813
## 12 CLT 4735
## 13 EWR 4314
## 14 LAS 4082
## 15 HRL 3983
## 16 MCO 3687
## 17 BNA 3481
## 18 MCI 3174
## 19 OKC 3170
## 20 TPA 3085
## 21 ELP 3036
## 22 SAN 2936
## 23 TUL 2924
## 24 SFO 2818
## 25 ABQ 2812
## 26 BHM 2736
## 27 LGA 2730
## 28 DCA 2699
## 29 SEA 2615
## 30 DTW 2601
## 31 BWI 2551
## 32 STL 2509
## 33 MIA 2463
## 34 FLL 2462
## 35 MEM 2399
## 36 PHL 2367
## 37 LFT 2313
## 38 MAF 2306
## 39 CLE 2140
## 40 JAX 2135
## 41 MDW 2094
## 42 OMA 2044
## 43 SLC 2033
## 44 JAN 2011
## 45 MSP 2010
## 46 IAD 1980
## 47 BTR 1762
## 48 BOS 1752
## 49 IND 1750
## 50 RDU 1740
## 51 BRO 1692
## 52 MOB 1674
## 53 PIT 1664
## 54 SNA 1661
## 55 COS 1657
## 56 GPT 1618
## 57 MKE 1588
## 58 LIT 1579
## 59 TUS 1565
## 60 PNS 1539
## 61 CVG 1535
## 62 ICT 1517
## 63 CMH 1348
## 64 LBB 1333
## 65 AMA 1297
## 66 SDF 1279
## 67 PBI 1253
## 68 PDX 1235
## 69 TYS 1210
## 70 CHS 1200
## 71 LRD 1188
## 72 XNA 1172
## 73 MFE 1128
## 74 GSP 1123
## 75 SMF 1014
## 76 ONT 952
## 77 RSW 948
## 78 HSV 923
## 79 RIC 900
## 80 SJC 885
## 81 VPS 880
## 82 SAV 863
## 83 SHV 787
## 84 ECP 729
## 85 AEX 724
## 86 ORF 717
## 87 JFK 695
## 88 OAK 690
## 89 GRR 677
## 90 DSM 647
## 91 GSO 630
## 92 LEX 584
## 93 CAE 561
## 94 BFL 504
## 95 DAY 451
## 96 CID 410
## 97 GJT 403
## 98 HNL 402
## 99 SJU 391
## 100 LCH 364
## 101 CRW 357
## 102 AVL 350
## 103 HOB 309
## 104 MLU 292
## 105 RNO 243
## 106 MTJ 164
## 107 ANC 125
## 108 ASE 125
## 109 BKG 110
## 110 EGE 110
## 111 HDN 110
## 112 PSP 106
## 113 GUC 86
## 114 GRK 42
## 115 BPT 3
## 116 AGS 1
# gapminder data
library(gapminder)
# Column names, types and leading values of gapminder
gapminder %>% glimpse()
## Rows: 1,704
## Columns: 6
## $ country <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
## $ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
## $ year <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
## $ lifeExp <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
## $ pop <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …
# Number of rows per country
count(gapminder, country)
## # A tibble: 142 × 2
## country n
## <fct> <int>
## 1 Afghanistan 12
## 2 Albania 12
## 3 Algeria 12
## 4 Angola 12
## 5 Argentina 12
## 6 Australia 12
## 7 Austria 12
## 8 Bahrain 12
## 9 Bangladesh 12
## 10 Belgium 12
## # ℹ 132 more rows
# Number of rows per continent
count(gapminder, continent)
## # A tibble: 5 × 2
## continent n
## <fct> <int>
## 1 Africa 624
## 2 Americas 300
## 3 Asia 396
## 4 Europe 360
## 5 Oceania 24
# Mean life expectancy per continent
gapminder %>%
  group_by(continent) %>%
  summarise(m = mean(lifeExp))
## # A tibble: 5 × 2
## continent m
## <fct> <dbl>
## 1 Africa 48.9
## 2 Americas 64.7
## 3 Asia 60.1
## 4 Europe 71.9
## 5 Oceania 74.3
# Ascending and descending order.
# arrange() needs the sort column: called with no arguments it leaves the rows
# in their existing order, so sort explicitly by the mean m (ascending).
gapminder %>%
  group_by(continent) %>%
  summarise(m = mean(lifeExp)) %>%
  arrange(m)
## # A tibble: 5 × 2
## continent m
## <fct> <dbl>
## 1 Africa 48.9
## 2 Asia 60.1
## 3 Americas 64.7
## 4 Europe 71.9
## 5 Oceania 74.3
# Continents ordered from highest to lowest mean life expectancy
gapminder %>%
  group_by(continent) %>%
  summarise(m = mean(lifeExp)) %>%
  arrange(desc(m))
## # A tibble: 5 × 2
## continent m
## <fct> <dbl>
## 1 Oceania 74.3
## 2 Europe 71.9
## 3 Americas 64.7
## 4 Asia 60.1
## 5 Africa 48.9
# Mean population per country, grouped within continent
gapminder %>%
  group_by(continent, country) %>%
  summarise(m1 = mean(pop))
## `summarise()` has grouped output by 'continent'. You can override using the
## `.groups` argument.
## # A tibble: 142 × 3
## # Groups: continent [5]
## continent country m1
## <fct> <fct> <dbl>
## 1 Africa Algeria 19875406.
## 2 Africa Angola 7309390.
## 3 Africa Benin 4017497.
## 4 Africa Botswana 971186.
## 5 Africa Burkina Faso 7548677.
## 6 Africa Burundi 4651608.
## 7 Africa Cameroon 9816648.
## 8 Africa Central African Republic 2560963
## 9 Africa Chad 5329256.
## 10 Africa Comoros 361684.
## # ℹ 132 more rows
# Mean population per country, largest first
gapminder %>%
  group_by(continent, country) %>%
  summarise(m1 = mean(pop)) %>%
  arrange(desc(m1))
## `summarise()` has grouped output by 'continent'. You can override using the
## `.groups` argument.
## # A tibble: 142 × 3
## # Groups: continent [5]
## continent country m1
## <fct> <fct> <dbl>
## 1 Asia China 958160052.
## 2 Asia India 701130740.
## 3 Americas United States 228211232.
## 4 Asia Indonesia 148322833.
## 5 Americas Brazil 122312127.
## 6 Asia Japan 111758808
## 7 Asia Pakistan 93683386.
## 8 Asia Bangladesh 90755395.
## 9 Europe Germany 77547043.
## 10 Africa Nigeria 73708018.
## # ℹ 132 more rows
# Read the tips data set from the seaborn-data repository and inspect it
tips <- read.csv(
  file = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv"
)
tips %>% glimpse()
## Rows: 244
## Columns: 7
## $ total_bill <dbl> 16.99, 10.34, 21.01, 23.68, 24.59, 25.29, 8.77, 26.88, 15.0…
## $ tip <dbl> 1.01, 1.66, 3.50, 3.31, 3.61, 4.71, 2.00, 3.12, 1.96, 3.23,…
## $ sex <chr> "Female", "Male", "Male", "Male", "Female", "Male", "Male",…
## $ smoker <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",…
## $ day <chr> "Sun", "Sun", "Sun", "Sun", "Sun", "Sun", "Sun", "Sun", "Su…
## $ time <chr> "Dinner", "Dinner", "Dinner", "Dinner", "Dinner", "Dinner",…
## $ size <int> 2, 3, 3, 2, 4, 4, 2, 4, 2, 2, 2, 4, 2, 4, 2, 2, 3, 3, 3, 3,…
# Column guide:
#   total_bill : amount of the bill
#   day        : day of the week
#   tip        : tip amount
#   time       : time
#   sex        : sex of the person who paid
#   size       : number of people at the table
#   smoker     : whether anyone smoked
tips %>% summary()
## total_bill tip sex smoker
## Min. : 3.07 Min. : 1.000 Length:244 Length:244
## 1st Qu.:13.35 1st Qu.: 2.000 Class :character Class :character
## Median :17.80 Median : 2.900 Mode :character Mode :character
## Mean :19.79 Mean : 2.998
## 3rd Qu.:24.13 3rd Qu.: 3.562
## Max. :50.81 Max. :10.000
## day time size
## Length:244 Length:244 Min. :1.00
## Class :character Class :character 1st Qu.:2.00
## Mode :character Mode :character Median :2.00
## Mean :2.57
## 3rd Qu.:3.00
## Max. :6.00
library(dplyr)
library(ggplot2)

# size is an integer count ranging from 1 to 6, so use one bin per value
# instead of the default 30 bins (which also triggers a stat_bin() message).
tips %>%
  ggplot(aes(size)) +
  geom_histogram(binwidth = 1)

# Tip against total bill
tips %>%
  ggplot(aes(x = total_bill, y = tip)) +
  geom_point()

# Same scatter, colored by day
tips %>%
  ggplot(aes(x = total_bill, y = tip)) +
  geom_point(aes(colour = day))

# Colored by day, point shape by sex, with larger points
tips %>%
  ggplot(aes(x = total_bill, y = tip)) +
  geom_point(aes(colour = day, shape = sex), size = 3)

# PT lecture notes 106
# Total population per continent for each year
y <- gapminder %>%
  group_by(year, continent) %>%
  summarise(c_pop = sum(pop))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
head(y, n = 10)
## # A tibble: 10 × 3
## # Groups: year [2]
## year continent c_pop
## <int> <fct> <dbl>
## 1 1952 Africa 237640501
## 2 1952 Americas 345152446
## 3 1952 Asia 1395357351
## 4 1952 Europe 418120846
## 5 1952 Oceania 10686006
## 6 1957 Africa 264837738
## 7 1957 Americas 386953916
## 8 1957 Asia 1562780599
## 9 1957 Europe 437890351
## 10 1957 Oceania 11941976
# Scatter of the per-continent population totals (c_pop) against year;
# one point per year/continent row of y.
plot(y$year,y$c_pop)

# Life expectancy against GDP per capita, colored by continent.
# The continent factor is used directly as the color, so level k is drawn in
# palette color k; the legend therefore uses the indices 1..k of the same
# factor that was plotted. Every point uses the default symbol (pch = 1), so
# the legend shows that single symbol rather than symbols absent from the plot.
plot(gapminder$gdpPercap, gapminder$lifeExp, col = gapminder$continent)
continent_levels <- levels(gapminder$continent)
legend(
  "bottomright",
  legend = continent_levels,
  pch = 1,
  col = seq_along(continent_levels)
)

# Same scatter with GDP per capita on a log10 scale, colored by continent.
# The legend uses color indices 1..k taken from the plotted continent factor
# and the single default symbol (pch = 1) that the points are drawn with.
plot(log10(gapminder$gdpPercap), gapminder$lifeExp, col = gapminder$continent)
continent_levels <- levels(gapminder$continent)
legend(
  "bottomright",
  legend = continent_levels,
  pch = 1,
  col = seq_along(continent_levels)
)

# ggplot
library(ggplot2)
# Life expectancy against GDP per capita (log10 x-axis), colored by continent
ggplot(gapminder, aes(x = gdpPercap, y = lifeExp, colour = continent)) +
  geom_point() +
  scale_x_log10()

# Additionally map population to point size
ggplot(gapminder, aes(x = gdpPercap, y = lifeExp, colour = continent, size = pop)) +
  geom_point() +
  scale_x_log10()

# Semi-transparent points
ggplot(gapminder, aes(x = gdpPercap, y = lifeExp, colour = continent, size = pop)) +
  geom_point(alpha = 0.5) +
  scale_x_log10()

# One panel per year
ggplot(gapminder, aes(x = gdpPercap, y = lifeExp, colour = continent, size = pop)) +
  geom_point(alpha = 0.5) +
  scale_x_log10() +
  facet_wrap(~year)

# Horizontal bar chart of 1952 population for Asian countries,
# with countries ordered by population
gapminder %>%
  filter(year == 1952, continent == "Asia") %>%
  ggplot(aes(x = reorder(country, pop), y = pop)) +
  geom_bar(stat = "identity") +
  coord_flip()

# All rows for the country 'Korea, Rep.'
gapminder %>%
  filter(country == "Korea, Rep.")
## # A tibble: 12 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Korea, Rep. Asia 1952 47.5 20947571 1031.
## 2 Korea, Rep. Asia 1957 52.7 22611552 1488.
## 3 Korea, Rep. Asia 1962 55.3 26420307 1536.
## 4 Korea, Rep. Asia 1967 57.7 30131000 2029.
## 5 Korea, Rep. Asia 1972 62.6 33505000 3031.
## 6 Korea, Rep. Asia 1977 64.8 36436000 4657.
## 7 Korea, Rep. Asia 1982 67.1 39326000 5623.
## 8 Korea, Rep. Asia 1987 69.8 41622000 8533.
## 9 Korea, Rep. Asia 1992 72.2 43805450 12104.
## 10 Korea, Rep. Asia 1997 74.6 46173816 15994.
## 11 Korea, Rep. Asia 2002 77.0 47969150 19234.
## 12 Korea, Rep. Asia 2007 78.6 49044790 23348.
# Life expectancy over time for 'Korea, Rep.': points joined by a line
gapminder %>%
  filter(country == "Korea, Rep.") %>%
  ggplot(aes(x = year, y = lifeExp, colour = country)) +
  geom_point() +
  geom_line()

# Same data with faint points and a smoothed trend
gapminder %>%
  filter(country == "Korea, Rep.") %>%
  ggplot(aes(x = year, y = lifeExp, colour = country)) +
  geom_point(alpha = 0.2) +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Smoothed life-expectancy trend per continent, using every row of gapminder
gapminder %>%
  ggplot(aes(x = year, y = lifeExp, colour = continent)) +
  geom_point(alpha = 0.2) +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Distribution of life expectancy in 1952
x <- filter(gapminder, year == 1952)
hist(x$lifeExp, main = "Histogram of lifeExp in 1952")

# Box plot of 1952 life expectancy by continent
ggplot(x, aes(x = continent, y = lifeExp)) +
  geom_boxplot()

# Read the binary admission data set from the UCLA stats site and inspect it
ucla <- read.csv(file = "https://stats.idre.ucla.edu/stat/data/binary.csv")
library(dplyr)
ucla %>% glimpse()
## Rows: 400
## Columns: 4
## $ admit <int> 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1…
## $ gre <int> 380, 660, 800, 640, 520, 760, 560, 400, 540, 700, 800, 440, 760,…
## $ gpa <dbl> 3.61, 3.67, 4.00, 3.19, 2.93, 3.00, 2.98, 3.08, 3.39, 3.92, 4.00…
## $ rank <int> 3, 3, 1, 4, 4, 2, 1, 2, 3, 2, 4, 1, 1, 2, 1, 3, 4, 3, 2, 1, 3, 2…
# Logistic regression of admit (as a factor) on all remaining columns
# (gre, gpa, rank), followed by the model summary
ucla[["admit"]] <- as.factor(ucla[["admit"]])
m <- glm(formula = admit ~ ., family = "binomial", data = ucla)
summary(m)
##
## Call:
## glm(formula = admit ~ ., family = "binomial", data = ucla)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.449548 1.132846 -3.045 0.00233 **
## gre 0.002294 0.001092 2.101 0.03564 *
## gpa 0.777014 0.327484 2.373 0.01766 *
## rank -0.560031 0.127137 -4.405 1.06e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 499.98 on 399 degrees of freedom
## Residual deviance: 459.44 on 396 degrees of freedom
## AIC: 467.44
##
## Number of Fisher Scoring iterations: 4
# Odds ratios for gpa and rank, computed from the fitted model instead of
# retyping the estimates printed by summary(m).
# Expected values: about 2.175 for gpa and 0.571 for rank.
exp(coef(m)[c("gpa", "rank")])
# Predicted admission probability for one applicant (gre 376, gpa 3.6, rank 3)
s <- data.frame(gre = 376, gpa = 3.6, rank = 3)
predict(m, newdata = s, type = "response")
## 1
## 0.1869631