bigdata

library(dplyr)

## 
## 다음의 패키지를 부착합니다: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Load the housing data and report its row count.
house <- read.csv(file = "housing.csv")
nrow(house)

## [1] 20640

# Column-wise overview of the housing data.
glimpse(house)

## Rows: 20,640
## Columns: 10
## $ longitude          <dbl> -122.23, -122.22, -122.24, -122.25, -122.25, -122.2…
## $ latitude           <dbl> 37.88, 37.86, 37.85, 37.85, 37.85, 37.85, 37.84, 37…
## $ housing_median_age <int> 41, 21, 52, 52, 52, 52, 52, 52, 42, 52, 52, 52, 52,…
## $ total_rooms        <int> 880, 7099, 1467, 1274, 1627, 919, 2535, 3104, 2555,…
## $ total_bedrooms     <int> 129, 1106, 190, 235, 280, 213, 489, 687, 665, 707, …
## $ population         <int> 322, 2401, 496, 558, 565, 413, 1094, 1157, 1206, 15…
## $ households         <int> 126, 1138, 177, 219, 259, 193, 514, 647, 595, 714, …
## $ median_income      <dbl> 8.3252, 8.3014, 7.2574, 5.6431, 3.8462, 4.0368, 3.6…
## $ median_house_value <int> 452600, 358500, 352100, 341300, 342200, 269700, 299…
## $ ocean_proximity    <chr> "NEAR BAY", "NEAR BAY", "NEAR BAY", "NEAR BAY", "NE…

# Keep only the first 80% of the rows.
# `:` truncates a fractional upper bound, so only whole rows are selected.
rownum <- 0.8 * nrow(house)
house1 <- house[1:rownum, ]
glimpse(house1)

## Rows: 16,512
## Columns: 10
## $ longitude          <dbl> -122.23, -122.22, -122.24, -122.25, -122.25, -122.2…
## $ latitude           <dbl> 37.88, 37.86, 37.85, 37.85, 37.85, 37.85, 37.84, 37…
## $ housing_median_age <int> 41, 21, 52, 52, 52, 52, 52, 52, 42, 52, 52, 52, 52,…
## $ total_rooms        <int> 880, 7099, 1467, 1274, 1627, 919, 2535, 3104, 2555,…
## $ total_bedrooms     <int> 129, 1106, 190, 235, 280, 213, 489, 687, 665, 707, …
## $ population         <int> 322, 2401, 496, 558, 565, 413, 1094, 1157, 1206, 15…
## $ households         <int> 126, 1138, 177, 219, 259, 193, 514, 647, 595, 714, …
## $ median_income      <dbl> 8.3252, 8.3014, 7.2574, 5.6431, 3.8462, 4.0368, 3.6…
## $ median_house_value <int> 452600, 358500, 352100, 341300, 342200, 269700, 299…
## $ ocean_proximity    <chr> "NEAR BAY", "NEAR BAY", "NEAR BAY", "NEAR BAY", "NE…

# Count the missing values in each column.
house1 %>%
  is.na() %>%
  colSums()

##          longitude           latitude housing_median_age        total_rooms 
##                  0                  0                  0                  0 
##     total_bedrooms         population         households      median_income 
##                159                  0                  0                  0 
## median_house_value    ocean_proximity 
##                  0                  0

# Standard deviation of total_bedrooms before the missing values are replaced.
df1 <- sd(house1[["total_bedrooms"]], na.rm = TRUE)
df1

## [1] 435.9006

# Median of total_bedrooms, used below as the replacement for missing values.
df2 <- median(house1[["total_bedrooms"]], na.rm = TRUE)
df2

## [1] 436

# Replace the missing total_bedrooms values with the median (df2).
house1$total_bedrooms[is.na(house1$total_bedrooms)] <- df2
# Re-check the missing-value counts.
house1 %>%
  is.na() %>%
  colSums()

##          longitude           latitude housing_median_age        total_rooms 
##                  0                  0                  0                  0 
##     total_bedrooms         population         households      median_income 
##                  0                  0                  0                  0 
## median_house_value    ocean_proximity 
##                  0                  0

# Standard deviation of total_bedrooms after the replacement; the difference
# between the two standard deviations is stored in df4 further down.
df3 <- sd(house1[["total_bedrooms"]])
df3

## [1] 433.9254

# Difference between the standard deviation before (df1) and after (df3)
# the replacement.
df4 <- df1 - df3
# Two ways to output the answer: print and cat.
print(df4)

## [1] 1.975147

# Same answer written with cat(), which omits the "[1]" index prefix.
cat(df4)

## 1.975147

# Missing values per column in the full housing data.
house %>%
  is.na() %>%
  colSums()

##          longitude           latitude housing_median_age        total_rooms 
##                  0                  0                  0                  0 
##     total_bedrooms         population         households      median_income 
##                207                  0                  0                  0 
## median_house_value    ocean_proximity 
##                  0                  0

# Drop the rows whose total_bedrooms is missing, then confirm none remain.
house <- filter(house, !is.na(total_bedrooms))
house %>%
  is.na() %>%
  colSums()

##          longitude           latitude housing_median_age        total_rooms 
##                  0                  0                  0                  0 
##     total_bedrooms         population         households      median_income 
##                  0                  0                  0                  0 
## median_house_value    ocean_proximity 
##                  0                  0

# Keep the first 70% of the remaining rows and show the quartiles of
# housing_median_age. `:` truncates a fractional upper bound.
rownum <- 0.7 * nrow(house)
house2 <- house[1:rownum, ]
quantile(house2[["housing_median_age"]])

##   0%  25%  50%  75% 100% 
##    1   19   30   38   52

# First quartile (the "25%" entry) of housing_median_age.
df <- quantile(house2$housing_median_age)[["25%"]]
print(df)

## [1] 19

# Same answer written with cat(), which omits the "[1]" index prefix.
cat(df)

## 19

# Load the Titanic data and show a column-wise overview.
titanic <- read.csv(file = "train100.csv")
glimpse(titanic)

## Rows: 891
## Columns: 11
## $ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ Survived    <int> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1…
## $ Pclass      <int> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3, 3…
## $ Name        <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (Fl…
## $ Sex         <chr> "male", "female", "female", "female", "male", "male", "mal…
## $ Age         <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 14, …
## $ SibSp       <int> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1, 0…
## $ Parch       <int> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0, 0…
## $ Ticket      <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "37…
## $ Fare        <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.8625,…
## $ Embarked    <chr> "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", "S", "S"…

# Count the NA values in each column.
titanic %>%
  is.na() %>%
  colSums()

## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0           0           0           0           0         177 
##       SibSp       Parch      Ticket        Fare    Embarked 
##           0           0           0           0           0

# Convert the categorical columns to factors so summary() tabulates them.
titanic[c("Embarked", "Sex", "Pclass")] <-
  lapply(titanic[c("Embarked", "Sex", "Pclass")], as.factor)
summary(titanic)

##   PassengerId       Survived      Pclass      Name               Sex     
##  Min.   :  1.0   Min.   :0.0000   1:216   Length:891         female:314  
##  1st Qu.:223.5   1st Qu.:0.0000   2:184   Class :character   male  :577  
##  Median :446.0   Median :0.0000   3:491   Mode  :character               
##  Mean   :446.0   Mean   :0.3838                                          
##  3rd Qu.:668.5   3rd Qu.:1.0000                                          
##  Max.   :891.0   Max.   :1.0000                                          
##                                                                          
##       Age            SibSp           Parch           Ticket         
##  Min.   : 0.42   Min.   :0.000   Min.   :0.0000   Length:891        
##  1st Qu.:20.12   1st Qu.:0.000   1st Qu.:0.0000   Class :character  
##  Median :28.00   Median :0.000   Median :0.0000   Mode  :character  
##  Mean   :29.70   Mean   :0.523   Mean   :0.3816                     
##  3rd Qu.:38.00   3rd Qu.:1.000   3rd Qu.:0.0000                     
##  Max.   :80.00   Max.   :8.000   Max.   :6.0000                     
##  NA's   :177                                                        
##       Fare        Embarked
##  Min.   :  0.00    :  2   
##  1st Qu.:  7.91   C:168   
##  Median : 14.45   Q: 77   
##  Mean   : 32.20   S:644   
##  3rd Qu.: 31.00           
##  Max.   :512.33           
##

# Frequency of each Embarked level, including the empty-string level.
table(titanic[["Embarked"]])

## 
##       C   Q   S 
##   2 168  77 644

# Share of missing entries (NA or empty string) in Age and in Embarked,
# expressed as a percentage of all rows.
df <- nrow(titanic)
df1 <- titanic %>%
  filter(is.na(Age) | Age == '') %>%
  summarise(n = n()) %>%
  mutate(pct = n / df * 100)
df2 <- titanic %>%
  filter(is.na(Embarked) | Embarked == '') %>%
  summarise(n = n()) %>%
  mutate(pct = n / df * 100)
df1
df2

##     n      pct
## 1 177 19.86532

##   n       pct
## 1 2 0.2244669

# Name of the sixth column (Age), the one with the larger missing share.
df3 <- colnames(titanic)[6]
print(df3)

## [1] "Age"

# Same answer written with cat(), which omits the "[1]" prefix and the quotes.
cat(df3)

## Age

library(MASS)

## 
## 다음의 패키지를 부착합니다: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

# Load the Boston data set shipped with MASS and show a column-wise overview.
data("Boston")
glimpse(Boston)

## Rows: 506
## Columns: 14
## $ crim    <dbl> 0.00632, 0.02731, 0.02729, 0.03237, 0.06905, 0.02985, 0.08829,…
## $ zn      <dbl> 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.5, 12.5, 12.5, 12.5, 12.5, 1…
## $ indus   <dbl> 2.31, 7.07, 7.07, 2.18, 2.18, 2.18, 7.87, 7.87, 7.87, 7.87, 7.…
## $ chas    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nox     <dbl> 0.538, 0.469, 0.469, 0.458, 0.458, 0.458, 0.524, 0.524, 0.524,…
## $ rm      <dbl> 6.575, 6.421, 7.185, 6.998, 7.147, 6.430, 6.012, 6.172, 5.631,…
## $ age     <dbl> 65.2, 78.9, 61.1, 45.8, 54.2, 58.7, 66.6, 96.1, 100.0, 85.9, 9…
## $ dis     <dbl> 4.0900, 4.9671, 4.9671, 6.0622, 6.0622, 6.0622, 5.5605, 5.9505…
## $ rad     <int> 1, 2, 2, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,…
## $ tax     <dbl> 296, 242, 242, 222, 222, 222, 311, 311, 311, 311, 311, 311, 31…
## $ ptratio <dbl> 15.3, 17.8, 17.8, 18.7, 18.7, 18.7, 15.2, 15.2, 15.2, 15.2, 15…
## $ black   <dbl> 396.90, 396.90, 392.83, 394.63, 396.90, 394.12, 395.60, 396.90…
## $ lstat   <dbl> 4.98, 9.14, 4.03, 2.94, 5.33, 5.21, 12.43, 19.15, 29.93, 17.10…
## $ medv    <dbl> 24.0, 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15…

# Sort by crime rate, highest first, and show the top rows.
boston1 <- arrange(Boston, desc(crim))
head(boston1)

##      crim zn indus chas   nox    rm   age    dis rad tax ptratio  black lstat
## 1 88.9762  0  18.1    0 0.671 6.968  91.9 1.4165  24 666    20.2 396.90 17.21
## 2 73.5341  0  18.1    0 0.679 5.957 100.0 1.8026  24 666    20.2  16.45 20.62
## 3 67.9208  0  18.1    0 0.693 5.683 100.0 1.4254  24 666    20.2 384.97 22.98
## 4 51.1358  0  18.1    0 0.597 5.757 100.0 1.4130  24 666    20.2   2.60 10.11
## 5 45.7461  0  18.1    0 0.693 4.519 100.0 1.6582  24 666    20.2  88.27 36.98
## 6 41.5292  0  18.1    0 0.693 5.531  85.4 1.6074  24 666    20.2 329.46 27.38
##   medv
## 1 10.4
## 2  8.8
## 3  5.0
## 4 15.0
## 5  7.0
## 6  8.5

# Tenth-largest crime rate (boston1 is sorted by crim in descending order).
boston1[["crim"]][10]

## [1] 25.9406

# Cap the ten largest crime rates at the 10th-largest value.
# boston1 is sorted by crim in descending order, so row 10 holds that value;
# reading it from the data avoids retyping a number copied from printed output.
boston1$crim[1:10] <- boston1$crim[10]
boston1 %>% head(10)

##       crim zn indus chas   nox    rm   age    dis rad tax ptratio  black lstat
## 1  25.9406  0  18.1    0 0.671 6.968  91.9 1.4165  24 666    20.2 396.90 17.21
## 2  25.9406  0  18.1    0 0.679 5.957 100.0 1.8026  24 666    20.2  16.45 20.62
## 3  25.9406  0  18.1    0 0.693 5.683 100.0 1.4254  24 666    20.2 384.97 22.98
## 4  25.9406  0  18.1    0 0.597 5.757 100.0 1.4130  24 666    20.2   2.60 10.11
## 5  25.9406  0  18.1    0 0.693 4.519 100.0 1.6582  24 666    20.2  88.27 36.98
## 6  25.9406  0  18.1    0 0.693 5.531  85.4 1.6074  24 666    20.2 329.46 27.38
## 7  25.9406  0  18.1    0 0.693 5.453 100.0 1.4896  24 666    20.2 396.90 30.59
## 8  25.9406  0  18.1    0 0.679 6.202  78.7 1.8629  24 666    20.2  18.82 14.52
## 9  25.9406  0  18.1    0 0.597 5.155 100.0 1.5894  24 666    20.2 210.97 20.08
## 10 25.9406  0  18.1    0 0.679 5.304  89.1 1.6475  24 666    20.2 127.36 26.64
##    medv
## 1  10.4
## 2   8.8
## 3   5.0
## 4  15.0
## 5   7.0
## 6   8.5
## 7   5.0
## 8  10.9
## 9  16.3
## 10 10.4

# Attaching MASS masked dplyr::select, so rebind the name to the dplyr version.
select <- dplyr::select
# Mean crime rate over the rows with age >= 80.
df <- boston1 %>%
  filter(age >= 80) %>%
  select(crim) %>%
  summarise(m = mean(crim))
df

##          m
## 1 5.759387

# df is a one-row, one-column data frame; [[1]] extracts the mean as a number.
print(df[[1]])

## [1] 5.759387

# Same answer written with cat(), which omits the "[1]" index prefix.
cat(df[[1]])

## 5.759387

# Load the insurance data and show a column-wise overview.
insurance <- read.csv(file = "insurance.csv")
glimpse(insurance)

## Rows: 1,338
## Columns: 7
## $ age      <int> 19, 18, 28, 33, 32, 31, 46, 37, 37, 60, 25, 62, 23, 56, 27, 1…
## $ sex      <chr> "female", "male", "male", "male", "male", "female", "female",…
## $ bmi      <dbl> 27.900, 33.770, 33.000, 22.705, 28.880, 25.740, 33.440, 27.74…
## $ children <int> 0, 1, 3, 0, 0, 0, 1, 3, 2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0…
## $ smoker   <chr> "yes", "no", "no", "no", "no", "no", "no", "no", "no", "no", …
## $ region   <chr> "southwest", "southeast", "southeast", "northwest", "northwes…
## $ charges  <dbl> 16884.924, 1725.552, 4449.462, 21984.471, 3866.855, 3756.622,…

# Count the missing values in each column.
insurance %>%
  is.na() %>%
  colSums()

##      age      sex      bmi children   smoker   region  charges 
##        0        0        0        0        0        0        0

# Mean of the insurance charges.
avg <- mean(insurance$charges)
avg

## [1] 13270.42

# Standard deviation of the charges, stored under its own name so the
# variable does not shadow stats::sd().
charges_sd <- sd(insurance$charges)
charges_sd

## [1] 12110.01

# Keep the rows whose charges lie at least 1.5 standard deviations away from
# the mean (on either side), then total their charges.
insurance1 <- insurance %>%
  filter(charges >= avg + 1.5 * charges_sd | charges <= avg - 1.5 * charges_sd)
sum(insurance1$charges)

## [1] 6421430

# Output the total charges of the selected rows with print().
print(sum(insurance1$charges))

## [1] 6421430

# Same total written with cat(), which omits the "[1]" index prefix.
cat(sum(insurance1$charges))

## 6421430

# Load the disease table (one row per year, one column per country).
df <- read.csv(file = "disease.csv")
glimpse(df)

## Rows: 4
## Columns: 194
## $ year                         <int> 1999, 2000, 2001, 2002
## $ Afghanistan                  <int> 0, 0, 0, 0
## $ Albania                      <dbl> 89.0, 132.0, 54.0, 4.9
## $ Algeria                      <dbl> 25.0, 0.0, 14.0, 0.7
## $ Andorra                      <dbl> 245.0, 138.0, 312.0, 12.4
## $ Angola                       <dbl> 217.0, 57.0, 45.0, 5.9
## $ Antigua...Barbuda            <dbl> 102.0, 128.0, 45.0, 4.9
## $ Argentina                    <dbl> 193.0, 25.0, 221.0, 8.3
## $ Armenia                      <dbl> 21.0, 179.0, 11.0, 3.8
## $ Australia                    <dbl> 261.0, 72.0, 212.0, 10.4
## $ Austria                      <dbl> 279.0, 75.0, 191.0, 9.7
## $ Azerbaijan                   <dbl> 21.0, 46.0, 5.0, 1.3
## $ Bahamas                      <dbl> 122.0, 176.0, 51.0, 6.3
## $ Bahrain                      <int> 42, 63, 7, 2
## $ Bangladesh                   <int> 0, 0, 0, 0
## $ Barbados                     <dbl> 143.0, 173.0, 36.0, 6.3
## $ Belarus                      <dbl> 142.0, 373.0, 42.0, 14.4
## $ Belgium                      <dbl> 295.0, 84.0, 212.0, 10.5
## $ Belize                       <dbl> 263.0, 114.0, 8.0, 6.8
## $ Benin                        <dbl> 34.0, 4.0, 13.0, 1.1
## $ Bhutan                       <dbl> 23.0, 0.0, 0.0, 0.4
## $ Bolivia                      <dbl> 167.0, 41.0, 8.0, 3.8
## $ Bosnia.Herzegovina           <dbl> 76.0, 173.0, 8.0, 4.6
## $ Botswana                     <dbl> 173.0, 35.0, 35.0, 5.4
## $ Brazil                       <dbl> 245.0, 145.0, 16.0, 7.2
## $ Brunei                       <dbl> 31.0, 2.0, 1.0, 0.6
## $ Bulgaria                     <dbl> 231.0, 252.0, 94.0, 10.3
## $ Burkina.Faso                 <dbl> 25.0, 7.0, 7.0, 4.3
## $ Burundi                      <dbl> 88.0, 0.0, 0.0, 6.3
## $ Cote.d.Ivoire                <int> 37, 1, 7, 4
## $ Cabo.Verde                   <int> 144, 56, 16, 4
## $ Cambodia                     <dbl> 57.0, 65.0, 1.0, 2.2
## $ Cameroon                     <dbl> 147.0, 1.0, 4.0, 5.8
## $ Canada                       <dbl> 240.0, 122.0, 100.0, 8.2
## $ Central.African.Republic     <dbl> 17.0, 2.0, 1.0, 1.8
## $ Chad                         <dbl> 15.0, 1.0, 1.0, 0.4
## $ Chile                        <dbl> 130.0, 124.0, 172.0, 7.6
## $ China                        <int> 79, 192, 8, 5
## $ Colombia                     <dbl> 159.0, 76.0, 3.0, 4.2
## $ Comoros                      <dbl> 1.0, 3.0, 1.0, 0.1
## $ Congo                        <dbl> 76.0, 1.0, 9.0, 1.7
## $ Cook.Islands                 <dbl> 0.0, 254.0, 74.0, 5.9
## $ Costa.Rica                   <dbl> 149.0, 87.0, 11.0, 4.4
## $ Croatia                      <dbl> 230.0, 87.0, 254.0, 10.2
## $ Cuba                         <dbl> 93.0, 137.0, 5.0, 4.2
## $ Cyprus                       <dbl> 192.0, 154.0, 113.0, 8.2
## $ Czech.Republic               <dbl> 361.0, 170.0, 134.0, 11.8
## $ North.Korea                  <int> 0, 0, 0, 0
## $ DR.Congo                     <dbl> 32.0, 3.0, 1.0, 2.3
## $ Denmark                      <dbl> 224.0, 81.0, 278.0, 10.4
## $ Djibouti                     <dbl> 15.0, 44.0, 3.0, 1.1
## $ Dominica                     <dbl> 52.0, 286.0, 26.0, 6.6
## $ Dominican.Republic           <dbl> 193.0, 147.0, 9.0, 6.2
## $ Ecuador                      <dbl> 162.0, 74.0, 3.0, 4.2
## $ Egypt                        <dbl> 6.0, 4.0, 1.0, 0.2
## $ El.Salvador                  <dbl> 52.0, 69.0, 2.0, 2.2
## $ Equatorial.Guinea            <dbl> 92.0, 0.0, 233.0, 5.8
## $ Eritrea                      <dbl> 18.0, 0.0, 0.0, 0.5
## $ Estonia                      <dbl> 224.0, 194.0, 59.0, 9.5
## $ Ethiopia                     <dbl> 20.0, 3.0, 0.0, 0.7
## $ Fiji                         <int> 77, 35, 1, 2
## $ Finland                      <int> 263, 133, 97, 10
## $ France                       <dbl> 127.0, 151.0, 370.0, 11.8
## $ Gabon                        <dbl> 347.0, 98.0, 59.0, 8.9
## $ Gambia                       <dbl> 8.0, 0.0, 1.0, 2.4
## $ Georgia                      <dbl> 52.0, 100.0, 149.0, 5.4
## $ Germany                      <dbl> 346.0, 117.0, 175.0, 11.3
## $ Ghana                        <dbl> 31.0, 3.0, 10.0, 1.8
## $ Greece                       <dbl> 133.0, 112.0, 218.0, 8.3
## $ Grenada                      <dbl> 199.0, 438.0, 28.0, 11.9
## $ Guatemala                    <dbl> 53.0, 69.0, 2.0, 2.2
## $ Guinea                       <dbl> 9.0, 0.0, 2.0, 0.2
## $ Guinea.Bissau                <dbl> 28.0, 31.0, 21.0, 2.5
## $ Guyana                       <dbl> 93.0, 302.0, 1.0, 7.1
## $ Haiti                        <dbl> 1.0, 326.0, 1.0, 5.9
## $ Honduras                     <int> 69, 98, 2, 3
## $ Hungary                      <dbl> 234.0, 215.0, 185.0, 11.3
## $ Iceland                      <dbl> 233.0, 61.0, 78.0, 6.6
## $ India                        <dbl> 9.0, 114.0, 0.0, 2.2
## $ Indonesia                    <dbl> 5.0, 1.0, 0.0, 0.1
## $ Iran                         <int> 0, 0, 0, 0
## $ Iraq                         <dbl> 9.0, 3.0, 0.0, 0.2
## $ Ireland                      <dbl> 313.0, 118.0, 165.0, 11.4
## $ Israel                       <dbl> 63.0, 69.0, 9.0, 2.5
## $ Italy                        <dbl> 85.0, 42.0, 237.0, 6.5
## $ Jamaica                      <dbl> 82.0, 88.0, 9.0, 3.4
## $ Japan                        <int> 77, 202, 16, 7
## $ Jordan                       <dbl> 6.0, 21.0, 1.0, 0.5
## $ Kazakhstan                   <dbl> 124.0, 246.0, 12.0, 6.8
## $ Kenya                        <dbl> 58.0, 22.0, 2.0, 1.8
## $ Kiribati                     <int> 21, 34, 1, 1
## $ Kuwait                       <int> 0, 0, 0, 0
## $ Kyrgyzstan                   <dbl> 31.0, 88.0, 6.0, 2.4
## $ Laos                         <dbl> 62.0, 0.0, 123.0, 6.2
## $ Latvia                       <dbl> 281.0, 216.0, 62.0, 10.5
## $ Lebanon                      <dbl> 20.0, 55.0, 31.0, 1.9
## $ Lesotho                      <dbl> 82.0, 50.0, 0.0, 2.8
## $ Liberia                      <dbl> 19.0, 152.0, 2.0, 3.1
## $ Libya                        <int> 0, 0, 0, 0
## $ Lithuania                    <dbl> 343.0, 244.0, 56.0, 12.9
## $ Luxembourg                   <dbl> 236.0, 133.0, 271.0, 11.4
## $ Madagascar                   <dbl> 26.0, 15.0, 4.0, 0.8
## $ Malawi                       <dbl> 8.0, 11.0, 1.0, 1.5
## $ Malaysia                     <dbl> 13.0, 4.0, 0.0, 0.3
## $ Maldives                     <int> 0, 0, 0, 0
## $ Mali                         <dbl> 5.0, 1.0, 1.0, 0.6
## $ Malta                        <dbl> 149.0, 100.0, 120.0, 6.6
## $ Marshall.Islands             <int> 0, 0, 0, 0
## $ Mauritania                   <int> 0, 0, 0, 0
## $ Mauritius                    <dbl> 98.0, 31.0, 18.0, 2.6
## $ Mexico                       <dbl> 238.0, 68.0, 5.0, 5.5
## $ Micronesia                   <dbl> 62.0, 50.0, 18.0, 2.3
## $ Monaco                       <int> 0, 0, 0, 0
## $ Mongolia                     <dbl> 77.0, 189.0, 8.0, 4.9
## $ Montenegro                   <dbl> 31.0, 114.0, 128.0, 4.9
## $ Morocco                      <dbl> 12.0, 6.0, 10.0, 0.5
## $ Mozambique                   <dbl> 47.0, 18.0, 5.0, 1.3
## $ Myanmar                      <dbl> 5.0, 1.0, 0.0, 0.1
## $ Namibia                      <dbl> 376.0, 3.0, 1.0, 6.8
## $ Nauru                        <int> 49, 0, 8, 1
## $ Nepal                        <dbl> 5.0, 6.0, 0.0, 0.2
## $ Netherlands                  <dbl> 251.0, 88.0, 190.0, 9.4
## $ New.Zealand                  <dbl> 203.0, 79.0, 175.0, 9.3
## $ Nicaragua                    <dbl> 78.0, 118.0, 1.0, 3.5
## $ Niger                        <dbl> 3.0, 2.0, 1.0, 0.1
## $ Nigeria                      <dbl> 42.0, 5.0, 2.0, 9.1
## $ Niue                         <int> 188, 200, 7, 7
## $ Norway                       <dbl> 169.0, 71.0, 129.0, 6.7
## $ Oman                         <dbl> 22.0, 16.0, 1.0, 0.7
## $ Pakistan                     <int> 0, 0, 0, 0
## $ Palau                        <dbl> 306.0, 63.0, 23.0, 6.9
## $ Panama                       <dbl> 285.0, 104.0, 18.0, 7.2
## $ Papua.New.Guinea             <dbl> 44.0, 39.0, 1.0, 1.5
## $ Paraguay                     <dbl> 213.0, 117.0, 74.0, 7.3
## $ Peru                         <dbl> 163.0, 160.0, 21.0, 6.1
## $ Philippines                  <dbl> 71.0, 186.0, 1.0, 4.6
## $ Poland                       <dbl> 343.0, 215.0, 56.0, 10.9
## $ Portugal                     <int> 194, 67, 339, 11
## $ Qatar                        <dbl> 1.0, 42.0, 7.0, 0.9
## $ South.Korea                  <dbl> 140.0, 16.0, 9.0, 9.8
## $ Moldova                      <dbl> 109.0, 226.0, 18.0, 6.3
## $ Romania                      <dbl> 297.0, 122.0, 167.0, 10.4
## $ Russian.Federation           <dbl> 247.0, 326.0, 73.0, 11.5
## $ Rwanda                       <dbl> 43.0, 2.0, 0.0, 6.8
## $ St..Kitts...Nevis            <dbl> 194.0, 205.0, 32.0, 7.7
## $ St..Lucia                    <dbl> 171.0, 315.0, 71.0, 10.1
## $ St..Vincent...the.Grenadines <dbl> 120.0, 221.0, 11.0, 6.3
## $ Samoa                        <dbl> 105.0, 18.0, 24.0, 2.6
## $ San.Marino                   <int> 0, 0, 0, 0
## $ Sao.Tome...Principe          <dbl> 56.0, 38.0, 140.0, 4.2
## $ Saudi.Arabia                 <dbl> 0.0, 5.0, 0.0, 0.1
## $ Senegal                      <dbl> 9.0, 1.0, 7.0, 0.3
## $ Serbia                       <dbl> 283.0, 131.0, 127.0, 9.6
## $ Seychelles                   <dbl> 157.0, 25.0, 51.0, 4.1
## $ Sierra.Leone                 <dbl> 25.0, 3.0, 2.0, 6.7
## $ Singapore                    <dbl> 60.0, 12.0, 11.0, 1.5
## $ Slovakia                     <dbl> 196.0, 293.0, 116.0, 11.4
## $ Slovenia                     <dbl> 270.0, 51.0, 276.0, 10.6
## $ Solomon.Islands              <dbl> 56.0, 11.0, 1.0, 1.2
## $ Somalia                      <int> 0, 0, 0, 0
## $ South.Africa                 <dbl> 225.0, 76.0, 81.0, 8.2
## $ Spain                        <int> 284, 157, 112, 10
## $ Sri.Lanka                    <dbl> 16.0, 104.0, 0.0, 2.2
## $ Sudan                        <dbl> 8.0, 13.0, 0.0, 1.7
## $ Suriname                     <dbl> 128.0, 178.0, 7.0, 5.6
## $ Swaziland                    <dbl> 90.0, 2.0, 2.0, 4.7
## $ Sweden                       <dbl> 152.0, 60.0, 186.0, 7.2
## $ Switzerland                  <dbl> 185.0, 100.0, 280.0, 10.2
## $ Syria                        <int> 5, 35, 16, 1
## $ Tajikistan                   <dbl> 2.0, 15.0, 0.0, 0.3
## $ Thailand                     <dbl> 99.0, 258.0, 1.0, 6.4
## $ Macedonia                    <dbl> 106.0, 27.0, 86.0, 3.9
## $ Timor.Leste                  <dbl> 1.0, 1.0, 4.0, 0.1
## $ Togo                         <dbl> 36.0, 2.0, 19.0, 1.3
## $ Tonga                        <dbl> 36.0, 21.0, 5.0, 1.1
## $ Trinidad...Tobago            <dbl> 197.0, 156.0, 7.0, 6.4
## $ Tunisia                      <dbl> 51.0, 3.0, 20.0, 1.3
## $ Turkey                       <dbl> 51.0, 22.0, 7.0, 1.4
## $ Turkmenistan                 <dbl> 19.0, 71.0, 32.0, 2.2
## $ Tuvalu                       <int> 6, 41, 9, 1
## $ Uganda                       <dbl> 45.0, 9.0, 0.0, 8.3
## $ Ukraine                      <dbl> 206.0, 237.0, 45.0, 8.9
## $ United.Arab.Emirates         <dbl> 16.0, 135.0, 5.0, 2.8
## $ United.Kingdom               <dbl> 219.0, 126.0, 195.0, 10.4
## $ Tanzania                     <dbl> 36.0, 6.0, 1.0, 5.7
## $ USA                          <dbl> 249.0, 158.0, 84.0, 8.7
## $ Uruguay                      <dbl> 115.0, 35.0, 220.0, 6.6
## $ Uzbekistan                   <dbl> 25.0, 101.0, 8.0, 2.4
## $ Vanuatu                      <dbl> 21.0, 18.0, 11.0, 0.9
## $ Venezuela                    <dbl> 333.0, 100.0, 3.0, 7.7
## $ Vietnam                      <int> 111, 2, 1, 2
## $ Yemen                        <dbl> 6.0, 0.0, 0.0, 0.1
## $ Zambia                       <dbl> 32.0, 19.0, 4.0, 2.5
## $ Zimbabwe                     <dbl> 64.0, 18.0, 4.0, 4.7

library(reshape)

## 
## 다음의 패키지를 부착합니다: 'reshape'

## The following object is masked from 'package:dplyr':
## 
##     rename

# Reshape from wide (one column per country) to long: one row per
# year/country pair, with the country in `variable` and the number in `value`.
# The argument is spelled out as id.vars instead of relying on partial
# matching of `id`.
df1 <- melt(df, id.vars = "year")
df1 %>% glimpse()

## Rows: 772
## Columns: 3
## $ year     <int> 1999, 2000, 2001, 2002, 1999, 2000, 2001, 2002, 1999, 2000, 2…
## $ variable <fct> Afghanistan, Afghanistan, Afghanistan, Afghanistan, Albania, …
## $ value    <dbl> 0.0, 0.0, 0.0, 0.0, 89.0, 132.0, 54.0, 4.9, 25.0, 0.0, 14.0, …

# Count the missing values in each column of the long table.
df1 %>%
  is.na() %>%
  colSums()

##     year variable    value 
##        0        0        0

# Give the melted columns descriptive names.
colnames(df1)[2:3] <- c("country", "disease")
colnames(df1)

## [1] "year"    "country" "disease"

# Mean of the disease values across all countries for the year 2000.
summarise(filter(df1, year == 2000), m = mean(disease))

##          m
## 1 81.01036

# Count the countries whose year-2000 value is above the year-2000 mean.
# The mean is computed from the filtered rows instead of being retyped from
# the printed output, which is rounded for display.
result <- df1 %>%
  filter(year == 2000) %>%
  filter(disease > mean(disease)) %>%
  NROW()
print(result)

## [1] 76

library(dplyr)
library(caret)

## 필요한 패키지를 로딩중입니다: ggplot2

## 필요한 패키지를 로딩중입니다: lattice

library(recipes)

## 
## 다음의 패키지를 부착합니다: 'recipes'

## The following object is masked from 'package:stats':
## 
##     step

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## 다음의 패키지를 부착합니다: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

# Read the test features, training features and training labels
# (the files are EUC-KR encoded).
x_test <- read.csv(file = "X_test.csv", fileEncoding = "euc-kr")
x_train <- read.csv(file = 'X_train.csv', fileEncoding = "euc-kr")
y_train <- read.csv(file = 'y_train.csv', fileEncoding = "euc-kr")
glimpse(x_train)

## Rows: 3,500
## Columns: 10
## $ cust_id        <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ 총구매액       <dbl> 68282840, 2136000, 3197000, 16077620, 29050000, 1137900…
## $ 최대구매액     <int> 11264000, 2136000, 1639000, 4935000, 24000000, 9552000,…
## $ 환불금액       <int> 6860000, 300000, NA, NA, NA, 462000, 4582000, 29524000,…
## $ 주구매상품     <chr> "기타", "스포츠", "남성 캐주얼", "기타", "보석", "디자…
## $ 주구매지점     <chr> "강남점", "잠실점", "관악점", "광주점", "본  점", "일산…
## $ 내점일수       <int> 19, 2, 2, 18, 2, 3, 5, 63, 18, 1, 25, 3, 2, 27, 84, 152…
## $ 내점당구매건수 <dbl> 3.894737, 1.500000, 2.000000, 2.444444, 1.500000, 1.666…
## $ 주말방문비율   <dbl> 0.52702703, 0.00000000, 0.00000000, 0.31818182, 0.00000…
## $ 구매주기       <int> 17, 1, 1, 16, 85, 42, 42, 5, 15, 0, 13, 89, 16, 10, 4, …

# Column-wise overview of the training labels.
glimpse(y_train)

## Rows: 3,500
## Columns: 2
## $ cust_id <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
## $ gender  <int> 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,…

# Attach the label to each training row by cust_id and tag the rows as
# "train" so they can be separated again after pre-processing.
train <- x_train %>%
  left_join(y_train, by = 'cust_id') %>%
  mutate(index = 'train')
glimpse(train)

## Rows: 3,500
## Columns: 12
## $ cust_id        <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ 총구매액       <dbl> 68282840, 2136000, 3197000, 16077620, 29050000, 1137900…
## $ 최대구매액     <int> 11264000, 2136000, 1639000, 4935000, 24000000, 9552000,…
## $ 환불금액       <int> 6860000, 300000, NA, NA, NA, 462000, 4582000, 29524000,…
## $ 주구매상품     <chr> "기타", "스포츠", "남성 캐주얼", "기타", "보석", "디자…
## $ 주구매지점     <chr> "강남점", "잠실점", "관악점", "광주점", "본  점", "일산…
## $ 내점일수       <int> 19, 2, 2, 18, 2, 3, 5, 63, 18, 1, 25, 3, 2, 27, 84, 152…
## $ 내점당구매건수 <dbl> 3.894737, 1.500000, 2.000000, 2.444444, 1.500000, 1.666…
## $ 주말방문비율   <dbl> 0.52702703, 0.00000000, 0.00000000, 0.31818182, 0.00000…
## $ 구매주기       <int> 17, 1, 1, 16, 85, 42, 42, 5, 15, 0, 13, 89, 16, 10, 4, …
## $ gender         <int> 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0…
## $ index          <chr> "train", "train", "train", "train", "train", "train", "…

# Tag the test rows as "test".
test <- mutate(x_test, index = 'test')
glimpse(test)

## Rows: 2,482
## Columns: 11
## $ cust_id        <int> 3500, 3501, 3502, 3503, 3504, 3505, 3506, 3507, 3508, 3…
## $ 총구매액       <dbl> 70900400, 310533100, 305264140, 7594080, 1795790, 13000…
## $ 최대구매액     <int> 22000000, 38558000, 14825000, 5225000, 1411200, 2160000…
## $ 환불금액       <int> 4050000, 48034700, 30521000, NA, NA, NA, 39566000, NA, …
## $ 주구매상품     <chr> "골프", "농산물", "가공식품", "주방용품", "수산품", "화…
## $ 주구매지점     <chr> "부산본점", "잠실점", "본  점", "부산본점", "청량리점",…
## $ 내점일수       <int> 13, 90, 101, 5, 3, 5, 144, 1, 1, 28, 21, 3, 23, 30, 3, …
## $ 내점당구매건수 <dbl> 1.461538, 2.433333, 14.623762, 2.000000, 2.666667, 2.20…
## $ 주말방문비율   <dbl> 0.78947368, 0.36986301, 0.08327691, 0.00000000, 0.12500…
## $ 구매주기       <int> 26, 3, 3, 47, 8, 61, 2, 0, 0, 12, 14, 2, 15, 11, 112, 2…
## $ index          <chr> "test", "test", "test", "test", "test", "test", "test",…

# Stack the training and test rows; test rows have no gender, so it is NA there.
full <- bind_rows(train, test)
# Recode gender 0 as "남성" and anything else as "여성", then convert the
# label and the train/test marker to factors.
full$gender <- as.factor(ifelse(full$gender == 0, "남성", "여성"))
full$index <- as.factor(full$index)
colnames(full)

##  [1] "cust_id"        "총구매액"       "최대구매액"     "환불금액"      
##  [5] "주구매상품"     "주구매지점"     "내점일수"       "내점당구매건수"
##  [9] "주말방문비율"   "구매주기"       "gender"         "index"

# Rebind select/rename to the dplyr versions (MASS and reshape mask them).
select <- dplyr::select
rename <- dplyr::rename
# Give the Korean-named columns short English names.
data <- rename(
  full,
  total = "총구매액",
  max = "최대구매액",
  refund = "환불금액",
  product = "주구매상품",
  store = "주구매지점",
  day = "내점일수",
  count = "내점당구매건수",
  week = "주말방문비율",
  cycle = "구매주기"
)
# Reorder so that the id, marker and label columns come first.
data <- select(
  data,
  cust_id, index, gender, total, max, refund, product, store, day, count,
  week, cycle
)
glimpse(data)

## Rows: 5,982
## Columns: 12
## $ cust_id <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
## $ index   <fct> train, train, train, train, train, train, train, train, train,…
## $ gender  <fct> 남성, 남성, 여성, 여성, 남성, 남성, 남성, 남성, 남성, 여성, 남…
## $ total   <dbl> 68282840, 2136000, 3197000, 16077620, 29050000, 11379000, 1005…
## $ max     <int> 11264000, 2136000, 1639000, 4935000, 24000000, 9552000, 761200…
## $ refund  <int> 6860000, 300000, NA, NA, NA, 462000, 4582000, 29524000, NA, NA…
## $ product <chr> "기타", "스포츠", "남성 캐주얼", "기타", "보석", "디자이너", "…
## $ store   <chr> "강남점", "잠실점", "관악점", "광주점", "본  점", "일산점", "…
## $ day     <int> 19, 2, 2, 18, 2, 3, 5, 63, 18, 1, 25, 3, 2, 27, 84, 152, 26, 2…
## $ count   <dbl> 3.894737, 1.500000, 2.000000, 2.444444, 1.500000, 1.666667, 2.…
## $ week    <dbl> 0.52702703, 0.00000000, 0.00000000, 0.31818182, 0.00000000, 0.…
## $ cycle   <int> 17, 1, 1, 16, 85, 42, 42, 5, 15, 0, 13, 89, 16, 10, 4, 2, 13, …

# Count the missing values in each column.
data %>%
  is.na() %>%
  colSums()

## cust_id   index  gender   total     max  refund product   store     day   count 
##       0       0    2482       0       0    3906       0       0       0       0 
##    week   cycle 
##       0       0

# Replace a missing refund amount with 0, then re-check the counts.
data$refund[is.na(data$refund)] <- 0
data %>%
  is.na() %>%
  colSums()

## cust_id   index  gender   total     max  refund product   store     day   count 
##       0       0    2482       0       0       0       0       0       0       0 
##    week   cycle 
##       0       0

# Pre-process the numeric columns: Yeo-Johnson transform, then scale, then
# center. prep() estimates the steps and juice() returns the processed rows.
# NOTE(review): `data` holds the train and test rows together, so the
# transformation parameters are estimated from both sets.
data1 <- recipe(gender ~ ., data = data) %>%
  step_YeoJohnson(total, max, refund, day, count, week, cycle) %>%
  step_scale(total, max, refund, day, count, week, cycle) %>%
  step_center(total, max, refund, day, count, week, cycle) %>%
  prep() %>%
  juice()
glimpse(data1)

## Rows: 5,982
## Columns: 12
## $ cust_id <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
## $ index   <fct> train, train, train, train, train, train, train, train, train,…
## $ total   <dbl> -0.1109616, -0.5964776, -0.5864340, -0.4794213, -0.3820895, -0…
## $ max     <dbl> -0.25879992, -0.58005471, -0.59931645, -0.47681619, 0.15253819…
## $ refund  <dbl> 1.3776676, 1.2130535, -0.7281455, -0.7281455, -0.7281455, 1.23…
## $ product <fct> 기타, 스포츠, 남성 캐주얼, 기타, 보석, 디자이너, 시티웨어, 명…
## $ store   <fct> 강남점, 잠실점, 관악점, 광주점, 본  점, 일산점, 강남점, 본  점…
## $ day     <dbl> 0.6267964, -0.9872986, -0.9872986, 0.5877041, -0.9872986, -0.7…
## $ count   <dbl> 0.92059492, -0.89611526, -0.32407144, 0.06726813, -0.89611526,…
## $ week    <dbl> 0.96145636, -1.31805060, -1.31805060, 0.32838074, -1.31805060,…
## $ cycle   <dbl> 0.28905563, -1.19528222, -1.19528222, 0.24219137, 1.78728765, …
## $ gender  <fct> 남성, 남성, 여성, 여성, 남성, 남성, 남성, 남성, 남성, 여성, 남…

# Split the pre-processed rows back into the training and test sets.
# cust_id is dropped together with index: it appears to be only a customer
# identifier, and leaving it in makes `gender ~ .` use it as a predictor
# (the recorded fit lists 10 predictors, which includes cust_id).
# The model output printed further down was produced before this change.
train <- data1 %>%
  filter(index == 'train') %>%
  select(-index, -cust_id)
test <- data1 %>%
  filter(index == 'test') %>%
  select(-index, -cust_id)

# 10-fold cross-validation with class probabilities, scored with
# twoClassSummary (ROC, Sens, Spec).
# NOTE(review): no seed is set before resampling, so the folds and the
# reported metrics will presumably differ between runs.
ctrl <- trainControl(
  method = 'cv',
  number = 10,
  summaryFunction = twoClassSummary,
  classProbs = TRUE
)
# Decision tree (rpart), tuned on ROC. `train` is both the caret function
# and the data frame passed as `data`; R resolves the call to the function.
rlift <- train(
  gender ~ .,
  data = train,
  method = 'rpart',
  metric = 'ROC',
  trControl = ctrl
)
# Logistic regression; rlift1 is not used by any later statement in this file.
rlift1 <- train(
  gender ~ .,
  data = train,
  method = 'glm',
  family = binomial,
  metric = 'ROC',
  trControl = ctrl
)

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

# Print the cross-validation results of the rpart model.
rlift

## CART 
## 
## 3500 samples
##   10 predictor
##    2 classes: '남성', '여성' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 3149, 3149, 3150, 3149, 3149, 3151, ... 
## Resampling results across tuning parameters:
## 
##   cp           ROC        Sens       Spec     
##   0.005319149  0.6333155  0.8054397  0.3685866
##   0.006838906  0.6274718  0.8013154  0.3601145
##   0.007598784  0.6273554  0.7948934  0.3699630
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.005319149.

# Class probabilities for the test rows from the rpart model.
pred_fit1 <- predict(rlift, test, type = 'prob')
head(pred_fit1)

##        남성      여성
## 1 0.7364290 0.2635710
## 2 0.7364290 0.2635710
## 3 0.7364290 0.2635710
## 4 0.4471698 0.5528302
## 5 0.4471698 0.5528302
## 6 0.6927711 0.3072289

# Rename the first probability column (class "남성") to "gender" for the
# submission file.
# NOTE(review): "남성" was derived from gender == 0 when `full` was built;
# confirm this is the class whose probability should be submitted.
colnames(pred_fit1)[1] <- 'gender'
head(pred_fit1)

##      gender      여성
## 1 0.7364290 0.2635710
## 2 0.7364290 0.2635710
## 3 0.7364290 0.2635710
## 4 0.4471698 0.5528302
## 5 0.4471698 0.5528302
## 6 0.6927711 0.3072289

# Pair each test cust_id with its predicted probability. bind_cols matches
# rows by position, so x_test and pred_fit1 must be in the same row order.
df <- select(bind_cols(x_test, pred_fit1), cust_id, gender)
head(df)

##   cust_id    gender
## 1    3500 0.7364290
## 2    3501 0.7364290
## 3    3502 0.7364290
## 4    3503 0.4471698
## 5    3504 0.4471698
## 6    3505 0.6927711

# Write the submission file without row names, then read it back to check it.
write.csv(df, file = "1818017.csv", row.names = FALSE)
head(read.csv('1818017.csv'))

##   cust_id    gender
## 1    3500 0.7364290
## 2    3501 0.7364290
## 3    3502 0.7364290
## 4    3503 0.4471698
## 5    3504 0.4471698
## 6    3505 0.6927711

bigdata_05

noh hyeon uk

2023-01-04