1 Introduction

In this paper, we are going to do text classification. Our problem is to determine which cuisine a recipe belongs to.

1.1 Libraries

library(data.table) # library for fast data processing
library(jsonlite) # library for reading data from json format file.
library(tm) # library to do data engineering with text
library(caret) # library for splitting data into train and test dataset
library(ggplot2) # library for data visualization
library(xgboost) # library for XGBoost
library(Matrix) # library used to work with sparse matrices
# library(Metrics)

1.2 Data import

To evaluate our model, we are going to use data from train.csv file as a our dataset.

setwd("~/Dropbox/RProjects/datasets/text_class/train.json")
tr <- fromJSON("train.json")

2 Data Engineering

We divide our dataset into train and test datasets.

index <- createDataPartition(tr$cuisine,p=0.7,list = F)
train <- tr[index,]
test <- tr[-index,]
rm(tr) # removing variable we no longer need
gc(reset = TRUE) # empty the memory from unused objects
          used  (Mb) gc trigger  (Mb) max used  (Mb)
Ncells 2013437 107.6    5547317 296.3  2013437 107.6
Vcells 4054861  31.0   10146329  77.5  4054861  31.0

Let’s take a peek at the datasets

head(train)
     id     cuisine
2 25693 southern_us
3 20130    filipino
4 22213      indian
5 13162      indian
6  6602    jamaican
7 42779     spanish
                                                                                                                                                                                                                                       ingredients
2                                                                                                              plain flour, ground pepper, salt, tomatoes, ground black pepper, thyme, eggs, green tomatoes, yellow corn meal, milk, vegetable oil
3                                                                                               eggs, pepper, salt, mayonaise, cooking oil, green chilies, grilled chicken breasts, garlic powder, yellow onion, soy sauce, butter, chicken livers
4                                                                                                                                                                                                                water, vegetable oil, wheat, salt
5 black pepper, shallots, cornflour, cayenne pepper, onions, garlic paste, milk, butter, salt, lemon juice, water, chili powder, passata, oil, ground cumin, boneless chicken skinless thigh, garam masala, double cream, natural yogurt, bay leaf
6                                                                                                  plain flour, sugar, butter, eggs, fresh ginger root, salt, ground cinnamon, milk, vanilla extract, ground ginger, powdered sugar, baking powder
7                                                                           olive oil, salt, medium shrimp, pepper, garlic, chopped cilantro, jalapeno chilies, flat leaf parsley, skirt steak, white vinegar, sea salt, bay leaf, chorizo sausage
head(test)
      id cuisine
1  10259   greek
8   3735 italian
9  16903 mexican
10 12734 italian
11  5875 italian
13  2698 italian
                                                                                                                                                                                   ingredients
1                                                                 romaine lettuce, black olives, grape tomatoes, garlic, pepper, purple onion, seasoning, garbanzo beans, feta cheese crumbles
8                                                          sugar, pistachio nuts, white almond bark, flour, vanilla extract, olive oil, almond extract, eggs, baking powder, dried cranberries
9  olive oil, purple onion, fresh pineapple, pork, poblano peppers, corn tortillas, cheddar cheese, ground black pepper, salt, iceberg lettuce, lime, jalapeno chilies, chopped cilantro fresh
10                                                                                               chopped tomatoes, fresh basil, garlic, extra-virgin olive oil, kosher salt, flat leaf parsley
11                       pimentos, sweet pepper, dried oregano, olive oil, garlic, sharp cheddar cheese, pepper, swiss cheese, provolone cheese, canola oil, mushrooms, black olives, sausages
13            Italian parsley leaves, walnuts, hot red pepper flakes, extra-virgin olive oil, fresh lemon juice, trout fillet, garlic cloves, chipotle chile, fine sea salt, flat leaf parsley

After we have splitted the datasets, we combine than with rbind. That way we can easly split them again later.

combi <- rbind(train, test)

We create corpus for our recipes and do some data engineering.

corpus <- Corpus(VectorSource(combi$ingredients))
corpus <- tm_map(corpus, tolower) # converting all text to lower case register
Warning in tm_map.SimpleCorpus(corpus, tolower): transformation drops
documents
corpus[[1]]
<<PlainTextDocument>>
Metadata:  7
Content:  chars: 156
corpus <- tm_map(corpus, removePunctuation) # remove punctuation
Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation
drops documents
corpus[[1]]
<<PlainTextDocument>>
Metadata:  7
Content:  chars: 122
corpus <- tm_map(corpus, removeWords, c(stopwords('english'))) # if present, remove words in English that does not characterize the recipes, such as I, we , me and etc.  
Warning in tm_map.SimpleCorpus(corpus, removeWords,
c(stopwords("english"))): transformation drops documents
corpus[[1]]
<<PlainTextDocument>>
Metadata:  7
Content:  chars: 122
corpus <- tm_map(corpus, stripWhitespace) # remove whitespaces
Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation
drops documents
corpus[[1]]
<<PlainTextDocument>>
Metadata:  7
Content:  chars: 122
corpus <- tm_map(corpus, stemDocument) # for all the words that have a common root, return the recognizable root word. For example, for words "computational", "computers" and "computation", returns "computer" and not "comput"
Warning in tm_map.SimpleCorpus(corpus, stemDocument): transformation drops
documents
corpus[[1]]
<<PlainTextDocument>>
Metadata:  7
Content:  chars: 113
head(corpus) #document matrix
<<SimpleCorpus>>
Metadata:  corpus specific: 1, document level (indexed): 0
Content:  documents: 6

We create a term-document matrix that shows how many times a particular term has occured in a text document (recipe).

frequencies <- DocumentTermMatrix(corpus)
frequencies
<<DocumentTermMatrix (documents: 39774, terms: 3800)>>
Non-/sparse entries: 745439/150395761
Sparsity           : 100%
Maximal term length: 19
Weighting          : term frequency (tf)

Organizing frequency of terms.

freq <- colSums(as.matrix(frequencies)) # forms row and column sums 
length(freq) # number of unique ingredient words (terms)
[1] 3800
ord <- order(freq) # order terms by number of occurrences in recipes
ord
   [1]  210  313  618  712  718  846 1019 1064 1070 1076 1099 1117 1164
  [14] 1260 1336 1361 1380 1407 1421 1424 1467 1479 1486 1519 1554 1566
  [27] 1577 1579 1599 1612 1620 1629 1631 1632 1633 1634 1647 1661 1670
  [40] 1672 1673 1690 1696 1709 1717 1735 1737 1761 1780 1792 1807 1811
  [53] 1826 1831 1849 1852 1855 1893 1894 1911 1921 1933 1934 1935 1949
  [66] 1950 1964 1968 1978 1988 2004 2016 2022 2024 2034 2036 2048 2052
  [79] 2053 2058 2063 2071 2078 2087 2103 2108 2111 2134 2137 2151 2169
  [92] 2174 2189 2204 2209 2210 2211 2222 2225 2239 2242 2243 2244 2247
 [105] 2248 2252 2256 2257 2258 2274 2285 2286 2290 2291 2298 2299 2329
 [118] 2352 2358 2362 2370 2371 2372 2377 2382 2386 2387 2393 2398 2399
 [131] 2400 2403 2404 2405 2407 2411 2412 2414 2416 2426 2432 2441 2444
 [144] 2450 2461 2462 2464 2473 2474 2481 2482 2484 2486 2488 2496 2498
 [157] 2507 2508 2514 2522 2524 2525 2533 2534 2538 2539 2549 2552 2553
 [170] 2556 2562 2570 2575 2577 2585 2591 2596 2600 2605 2608 2615 2618
 [183] 2621 2623 2628 2630 2632 2633 2637 2639 2640 2641 2643 2649 2651
 [196] 2658 2659 2660 2666 2668 2669 2673 2679 2680 2685 2686 2688 2691
 [209] 2696 2702 2705 2713 2714 2715 2721 2723 2724 2725 2729 2733 2734
 [222] 2741 2742 2757 2758 2759 2761 2764 2766 2767 2768 2774 2786 2788
 [235] 2790 2793 2796 2798 2801 2803 2809 2812 2813 2814 2815 2816 2818
 [248] 2820 2824 2826 2830 2832 2839 2840 2844 2846 2853 2856 2862 2866
 [261] 2870 2874 2875 2878 2883 2886 2887 2889 2893 2897 2898 2901 2903
 [274] 2904 2906 2907 2910 2912 2913 2914 2916 2923 2926 2927 2930 2931
 [287] 2932 2934 2936 2939 2941 2943 2947 2948 2949 2950 2951 2954 2956
 [300] 2959 2960 2961 2964 2970 2972 2973 2974 2976 2978 2982 2985 2991
 [313] 2999 3002 3005 3007 3009 3010 3011 3012 3014 3015 3019 3023 3026
 [326] 3028 3030 3031 3035 3036 3038 3039 3041 3044 3045 3046 3048 3049
 [339] 3050 3052 3058 3059 3061 3062 3069 3071 3074 3078 3079 3081 3082
 [352] 3083 3084 3086 3087 3089 3090 3091 3092 3096 3098 3099 3106 3111
 [365] 3117 3120 3121 3122 3125 3128 3131 3133 3134 3135 3136 3138 3139
 [378] 3142 3144 3145 3150 3151 3152 3153 3154 3158 3160 3164 3166 3167
 [391] 3168 3170 3172 3173 3178 3181 3182 3184 3185 3186 3187 3188 3191
 [404] 3193 3195 3196 3197 3199 3202 3208 3209 3214 3216 3217 3218 3220
 [417] 3221 3222 3224 3228 3229 3231 3233 3236 3239 3241 3244 3245 3247
 [430] 3250 3252 3254 3256 3257 3259 3260 3261 3262 3263 3264 3265 3269
 [443] 3271 3272 3273 3274 3276 3280 3282 3283 3284 3286 3290 3292 3293
 [456] 3295 3297 3298 3300 3302 3305 3308 3316 3318 3321 3322 3323 3324
 [469] 3325 3327 3328 3329 3331 3332 3333 3334 3336 3338 3339 3341 3342
 [482] 3345 3347 3348 3349 3350 3354 3357 3359 3360 3361 3362 3364 3368
 [495] 3369 3370 3371 3373 3374 3376 3377 3378 3379 3380 3384 3385 3386
 [508] 3388 3389 3390 3391 3392 3394 3395 3397 3398 3400 3402 3403 3404
 [521] 3405 3406 3407 3408 3409 3410 3411 3413 3414 3415 3417 3418 3419
 [534] 3423 3424 3425 3426 3428 3429 3430 3431 3432 3433 3435 3436 3437
 [547] 3438 3439 3440 3441 3442 3444 3445 3446 3447 3448 3449 3450 3451
 [560] 3453 3456 3458 3459 3460 3461 3463 3464 3465 3466 3467 3469 3470
 [573] 3471 3472 3474 3475 3476 3477 3479 3480 3481 3482 3483 3484 3485
 [586] 3486 3488 3489 3490 3491 3492 3494 3495 3497 3498 3499 3500 3501
 [599] 3502 3504 3505 3506 3507 3508 3509 3510 3512 3514 3515 3517 3518
 [612] 3520 3522 3523 3524 3525 3526 3528 3529 3530 3531 3532 3533 3535
 [625] 3537 3538 3539 3540 3541 3542 3543 3544 3545 3547 3548 3549 3550
 [638] 3551 3552 3554 3555 3556 3557 3558 3559 3560 3561 3563 3564 3565
 [651] 3566 3568 3569 3570 3571 3572 3573 3574 3575 3576 3578 3579 3581
 [664] 3583 3584 3585 3586 3587 3588 3589 3591 3592 3593 3594 3595 3596
 [677] 3598 3599 3600 3601 3602 3604 3605 3606 3607 3608 3609 3610 3611
 [690] 3612 3613 3614 3615 3617 3618 3619 3621 3622 3623 3624 3625 3626
 [703] 3627 3628 3629 3630 3631 3632 3634 3636 3638 3639 3640 3641 3643
 [716] 3644 3647 3649 3650 3652 3653 3654 3655 3656 3657 3658 3659 3660
 [729] 3661 3662 3664 3667 3668 3669 3670 3673 3674 3675 3676 3677 3678
 [742] 3679 3680 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692
 [755] 3693 3694 3695 3696 3697 3699 3700 3701 3703 3704 3705 3706 3707
 [768] 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722
 [781] 3723 3724 3725 3726 3728 3729 3730 3731 3732 3733 3734 3735 3736
 [794] 3737 3738 3739 3740 3741 3742 3743 3744 3745 3747 3748 3749 3750
 [807] 3751 3752 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764
 [820] 3765 3766 3767 3768 3769 3770 3771 3772 3774 3775 3776 3777 3778
 [833] 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791
 [846] 3792 3793 3794 3795 3796 3797 3798 3799 3800  450  570  655  671
 [859]  953  992  996 1024 1048 1083 1094 1118 1140 1169 1262 1263 1300
 [872] 1308 1378 1387 1388 1401 1418 1429 1430 1431 1445 1513 1516 1521
 [885] 1532 1534 1557 1565 1586 1607 1648 1684 1698 1775 1777 1781 1784
 [898] 1785 1814 1828 1858 1862 1897 1905 1906 1908 1926 1936 1952 1957
 [911] 1958 1960 1980 1982 1995 2020 2035 2041 2042 2050 2055 2056 2067
 [924] 2069 2094 2105 2106 2115 2119 2120 2128 2139 2156 2158 2164 2167
 [937] 2171 2186 2193 2199 2213 2230 2235 2259 2268 2277 2278 2279 2281
 [950] 2293 2294 2300 2302 2307 2310 2318 2323 2326 2330 2337 2340 2351
 [963] 2356 2364 2380 2381 2394 2395 2402 2419 2423 2425 2428 2431 2438
 [976] 2440 2446 2451 2456 2457 2469 2476 2479 2480 2500 2504 2510 2515
 [989] 2528 2535 2536 2541 2543 2550 2555 2561 2569 2573 2588 2590 2592
[1002] 2598 2599 2603 2606 2619 2620 2624 2626 2627 2638 2642 2646 2650
[1015] 2663 2664 2665 2670 2678 2681 2687 2694 2698 2703 2704 2706 2709
[1028] 2710 2712 2716 2718 2727 2728 2730 2735 2737 2738 2744 2746 2747
[1041] 2751 2770 2775 2778 2782 2785 2787 2792 2795 2802 2806 2819 2822
[1054] 2827 2829 2834 2835 2837 2838 2841 2842 2845 2848 2850 2851 2855
[1067] 2857 2867 2868 2873 2876 2877 2881 2882 2884 2891 2896 2902 2908
[1080] 2915 2918 2919 2921 2922 2924 2925 2928 2935 2945 2955 2965 2966
[1093] 2967 2971 2975 2980 2981 2987 2988 2989 2990 2993 2994 2995 2997
[1106] 3001 3004 3006 3008 3013 3016 3017 3018 3020 3021 3022 3024 3027
[1119] 3029 3037 3043 3053 3054 3055 3057 3063 3064 3072 3073 3076 3077
[1132] 3080 3085 3088 3094 3097 3100 3101 3102 3104 3105 3107 3112 3113
[1145] 3115 3126 3129 3130 3146 3149 3156 3157 3169 3171 3174 3175 3176
[1158] 3179 3180 3183 3190 3198 3206 3207 3210 3211 3215 3226 3230 3235
[1171] 3237 3242 3246 3255 3266 3267 3268 3270 3277 3281 3285 3287 3288
[1184] 3289 3291 3299 3303 3304 3309 3312 3313 3314 3330 3337 3344 3346
[1197] 3351 3352 3353 3355 3356 3358 3363 3367 3372 3375 3381 3383 3387
[1210] 3393 3399 3412 3420 3422 3427 3452 3454 3457 3468 3473 3478 3487
[1223] 3493 3496 3503 3511 3513 3516 3519 3521 3527 3536 3546 3553 3567
[1236] 3577 3582 3590 3597 3603 3616 3620 3633 3635 3637 3642 3645 3646
[1249] 3663 3665 3666 3671 3672 3681 3698 3702 3708 3709 3753 3773  352
[1262]  602  679  697  711  813  822  849  888  907  934  998 1008 1074
[1275] 1113 1180 1200 1201 1206 1259 1319 1325 1444 1446 1447 1473 1496
[1288] 1528 1558 1572 1593 1619 1623 1625 1636 1638 1656 1667 1669 1688
[1301] 1691 1692 1695 1723 1765 1771 1772 1786 1788 1793 1798 1824 1840
[1314] 1847 1864 1879 1885 1937 1943 1951 1953 1959 1975 1991 2002 2003
[1327] 2010 2015 2021 2030 2037 2047 2080 2088 2122 2124 2126 2130 2131
[1340] 2133 2142 2150 2153 2168 2173 2179 2192 2206 2214 2216 2231 2232
[1353] 2241 2246 2253 2254 2255 2265 2288 2292 2296 2297 2316 2319 2324
[1366] 2333 2334 2343 2347 2349 2353 2363 2367 2376 2378 2379 2401 2413
[1379] 2421 2422 2427 2449 2453 2455 2458 2465 2478 2501 2502 2530 2542
[1392] 2544 2545 2557 2558 2572 2574 2587 2589 2593 2602 2612 2616 2629
[1405] 2656 2662 2671 2675 2677 2683 2684 2692 2700 2720 2739 2740 2745
[1418] 2748 2753 2756 2760 2769 2777 2784 2791 2794 2799 2811 2821 2833
[1431] 2843 2847 2849 2854 2859 2880 2888 2892 2894 2895 2899 2900 2911
[1444] 2917 2920 2933 2940 2942 2946 2957 2958 2963 2969 2977 2979 2983
[1457] 2984 2986 2992 3003 3032 3033 3034 3040 3042 3047 3066 3095 3109
[1470] 3114 3118 3119 3123 3124 3137 3140 3143 3147 3155 3159 3163 3165
[1483] 3189 3194 3205 3212 3213 3219 3225 3240 3243 3253 3275 3279 3296
[1496] 3301 3306 3307 3315 3319 3320 3340 3365 3366 3382 3396 3416 3421
[1509] 3434 3455 3562 3648 3651 3727 3746  419  494  521  604  851  989
[1522]  991 1009 1068 1116 1207 1220 1244 1320 1337 1343 1359 1374 1403
[1535] 1428 1461 1488 1503 1508 1520 1560 1598 1617 1621 1668 1697 1715
[1548] 1768 1806 1815 1817 1850 1874 1884 1914 1941 1946 1955 1956 1976
[1561] 1981 1987 1990 2017 2033 2046 2066 2081 2083 2096 2121 2138 2147
[1574] 2155 2175 2180 2202 2208 2212 2215 2218 2220 2261 2269 2305 2309
[1587] 2315 2327 2338 2345 2359 2374 2385 2389 2390 2410 2417 2420 2424
[1600] 2436 2437 2439 2454 2460 2470 2483 2490 2491 2492 2494 2499 2511
[1613] 2526 2532 2540 2546 2554 2578 2581 2583 2597 2617 2644 2645 2648
[1626] 2653 2657 2667 2674 2676 2707 2731 2754 2762 2771 2772 2773 2779
[1639] 2781 2789 2797 2808 2810 2817 2823 2831 2836 2858 2861 2869 2879
[1652] 2885 2929 2952 2962 2968 3000 3051 3056 3070 3075 3093 3116 3132
[1665] 3161 3162 3201 3223 3227 3234 3238 3248 3249 3251 3278 3294 3311
[1678] 3343 3401 3443 3462 3580  577  696  698  699  719  738  758  824
[1691] 1034 1042 1071 1104 1114 1115 1158 1208 1213 1245 1312 1313 1348
[1704] 1423 1457 1460 1518 1525 1535 1576 1606 1640 1651 1664 1676 1699
[1717] 1704 1739 1755 1762 1770 1794 1821 1842 1871 1886 1912 1954 1970
[1730] 1973 1992 2000 2001 2118 2125 2170 2176 2182 2188 2197 2207 2223
[1743] 2226 2227 2249 2251 2260 2262 2266 2276 2303 2336 2342 2365 2383
[1756] 2388 2406 2430 2435 2447 2467 2468 2472 2505 2568 2576 2580 2584
[1769] 2586 2594 2595 2604 2610 2611 2613 2622 2634 2635 2652 2655 2661
[1782] 2690 2693 2695 2701 2711 2722 2800 2860 2863 2871 2872 2905 2937
[1795] 2938 3110 3200 3203 3204 3317  424  563  564  635  801  812  865
[1808]  902  930 1004 1043 1063 1105 1130 1141 1204 1255 1272 1293 1301
[1821] 1307 1311 1389 1449 1450 1456 1469 1480 1650 1654 1657 1674 1719
[1834] 1721 1796 1808 1812 1816 1833 1856 1857 1861 1887 1929 1940 1945
[1847] 1948 2009 2019 2061 2062 2064 2074 2076 2082 2089 2098 2099 2102
[1860] 2127 2132 2140 2141 2152 2154 2157 2161 2195 2201 2224 2228 2283
[1873] 2322 2397 2442 2459 2503 2513 2520 2521 2523 2547 2563 2565 2567
[1886] 2601 2614 2625 2682 2699 2736 2743 2755 2763 2852 2890 2996 3065
[1899] 3067 3127 3141 3148 3177 3192 3326 3534  354  420  447  488  677
[1912]  694  700  702  762  935  936  967 1000 1017 1049 1087 1135 1193
[1925] 1269 1326 1334 1344 1363 1395 1427 1435 1436 1451 1466 1472 1475
[1938] 1512 1530 1589 1659 1663 1710 1727 1728 1746 1752 1782 1789 1790
[1951] 1825 1827 1846 1860 1900 1902 1903 1920 1928 1930 1965 1998 2032
[1964] 2043 2057 2065 2075 2086 2093 2116 2136 2148 2149 2160 2166 2177
[1977] 2198 2219 2240 2264 2304 2332 2350 2369 2448 2509 2518 2527 2537
[1990] 2559 2672 2689 2697 2717 2726 2732 2752 2776 2864 2865 2944 3103
[2003] 3108 3232 3258 3310  349  458  663 1044 1061 1101 1122 1233 1277
[2016] 1278 1310 1314 1355 1375 1376 1391 1419 1439 1507 1527 1585 1635
[2029] 1642 1705 1708 1711 1734 1744 1747 1758 1763 1913 1923 1927 1961
[2042] 1963 1972 1999 2013 2039 2072 2091 2095 2104 2117 2172 2217 2221
[2055] 2233 2263 2267 2271 2273 2282 2348 2361 2373 2375 2384 2396 2429
[2068] 2445 2463 2475 2506 2531 2548 2607 2780 2825 3025 3068 3335  214
[2081]  432  522  558  569  662  682  704  729  872  925  947  988 1047
[2094] 1129 1159 1165 1190 1297 1338 1377 1394 1405 1438 1515 1537 1540
[2107] 1564 1569 1583 1644 1653 1655 1662 1677 1759 1800 1810 1841 1848
[2120] 1851 1873 1881 1896 1904 1922 1962 2027 2049 2051 2073 2079 2092
[2133] 2112 2123 2165 2191 2196 2200 2205 2229 2289 2295 2339 2366 2391
[2146] 2434 2443 2452 2495 2516 2551 2579 2654 2765 2805 3060  248  271
[2159]  358  361  470  508  751  829  855 1199 1219 1235 1393 1448 1476
[2172] 1511 1529 1556 1602 1616 1630 1645 1685 1750 1787 1797 1805 1813
[2185] 1835 1836 1866 1891 1907 1916 2005 2011 2025 2029 2185 2194 2275
[2198] 2317 2335 2392 2408 2409 2415 2466 2560 2708 2783 2953  125  212
[2211]  416  481  688  692  786  842  861 1018 1026 1088 1092 1111 1153
[2224] 1160 1184 1243 1358 1422 1487 1605 1614 1700 1713 1748 1818 1883
[2237] 1895 1898 1910 1918 1938 2031 2113 2146 2159 2183 2272 2320 2325
[2250] 2471 2485 2497 2564 2571 2719 2749 2807 2828 2909  814  863 1032
[2263] 1100 1142 1157 1182 1256 1285 1286 1289 1290 1296 1354 1410 1474
[2276] 1493 1526 1568 1590 1641 1720 1736 1742 1823 1867 1869 1870 1901
[2289] 1925 2012 2040 2145 2163 2314 2357 2512 2582 2609 2998   24  302
[2302]  351  668  899 1217 1264 1321 1324 1379 1433 1440 1484 1510 1652
[2315] 1666 1726 1729 1749 1751 1756 1757 1832 1876 1878 1944 2059 2068
[2328] 2085 2100 2114 2245 2287 2313 2331 2355 2477 2566 2636  332  353
[2341]  375  482  505  942  949  962 1014 1052 1065 1075 1123 1143 1171
[2354] 1224 1265 1268 1304 1382 1390 1414 1415 1477 1485 1494 1551 1571
[2367] 1613 1682 1707 1740 1745 1754 1880 1919 1977 2006 2014 2084 2135
[2380] 2143 2144 2184 2190 2301 2308 2312 2328 2489 2631 2647 2804  745
[2393]  821  981 1040 1080 1125 1174 1294 1295 1432 1463 1545 1588 1594
[2406] 1725 1769 1773 1868 1890 1892 1997 2008 2038 2181 2238 2311 2341
[2419]  461  753  802  832  836  995 1013 1250 1315 1328 1398 1408 1425
[2432] 1542 1549 1574 1601 1610 1702 1732 1753 1882 1966 1989 2018 2045
[2445] 2054 2090 2129 2236 2306 2346 2354 2433 2517 2529 2750  121  136
[2458]  151  328  460  462  695  830  848  854  923  933 1012 1022 1089
[2471] 1246 1261 1280 1335 1386 1491 1506 1573 1581 1628 1766 1783 1799
[2484] 1819 1854 1915 1939 1979 2107 2109 2187 2237 2360 2368 2487 2493
[2497]  457  498  743  793  795 1226 1331 1345 1362 1482 1495 1544 1567
[2510] 1578 1582 1646 1716 1779 1822 1844 1853 2044 2077 2234 2270 2280
[2523]  295  379  606  748  757  916 1109 1151 1152 1218 1266 1273 1481
[2536] 1587 1600 1627 1733 1760 1764 1795 1985 2250 2418  213  357  628
[2549]  739  844  860  884  896  975 1112 1148 1150 1191 1227 1303 1383
[2562] 1492 1523 1531 1553 1637 1687 1724 1743 1872 1942 1967 1983 1984
[2575] 2101 2519  921 1102 1110 1147 1214 1223 1249 1298 1400 1471 1499
[2588] 1500 1570 1680 1694 1776 1791 2026 2178   47  642  649  858 1006
[2601] 1107 1124 1188 1267 1275 1357 1369 1406 1459 1465 1509 1596 1604
[2614] 1658 1671 1693 1801 1820 1845 1899 1994 2028 2070 2110 2162  130
[2627]  356  468  880 1051 1079 1232 1353 1366 1543 1678 1722 1889 1969
[2640] 2097 2284 2321 2344  167  394  427  554  654  809  869 1050 1059
[2653] 1229 1346 1350 1368 1370 1420 1464 1497 1522 1538 1562 1681 1686
[2666] 1809 1877 1986  292  350  475  588  644  647  904  979 1173 1222
[2679] 1240 1306 1349 1351 1417 1434 1552 1555 1559 1643 1675 1730 1741
[2692] 1830 1837  620  811 1046 1172 1247 1283 1455 1502 1575 1626 1803
[2705] 1863  301  395  607  825  852  963 1132 1139 1279 1365 1373 1802
[2718]  631 1054 1134 1489 1505 1738 1774 1859 1917 1932 1996  504  969
[2731] 1007 1144 1595 1703 1924 1931 2007  376  526  799  886  970 1021
[2744] 1037 1483 1679 1888  382  693  897  928  941 1077 1145 1216 1254
[2757] 1282 1843  194  344  773  954  986 1058 1098 1127 1452 1458 1514
[2770] 1597 1865 1974 2023 2203  157  645  684  724  774 1330 1332 1396
[2783] 1453 1517 1536 1993  476  496  850  870  876  927 1241 1347 1367
[2796] 1561 1706 1718  734  750  816  910 1242 1333 1470 1829  610  782
[2809]  820  874  875  889  982 1025 1062 1155 1251 1329 1539 1547 1550
[2822] 1609 1624 1804 1875 2060  180  285  576  597  831  906  937 1120
[2835] 1205 1591 1731 1767 1834  116  708  843 1078 1179 1533 1580 1584
[2848] 1622 1689 1701  225  249  373  525  672  725  985 1031 1146 1189
[2861] 1209 1211 1248 1341 1342 1413 1947  211  771  882  973 1121 1360
[2874]  720  983 1073 1103 1162 1170 1276 1339  622  898 1468 1618  221
[2887]  887  911  932 1038 1082 1128 1210 1221 1322 1385 1541 1665  203
[2900]  987 1305 1340 1384 1608  234  784  885  971 1176 1203 1318 1352
[2913] 1563  156  499  993 1027 1057 1404  337  412  667  747  792 1838
[2926]  274  518  537  544  669  775 1108 1228 1274 1592  385  507  767
[2939]  772 1001 1035 1443 1714   40  196  656  952 1030 1212 1234 1287
[2952] 1309 1317 1327 1412 1504  847  912  948 1015 1183 1462 1548  370
[2965]  834  857 1181 1392 1498 1683 1839  312  817 1053 1302 1660  652
[2978]  794  974 1126 1316 1639  220  619  929  944 1010 1270   46  282
[2991]  617  630  779  845 1055 1149 1198 1615  231  466  730  920 1005
[3004] 1023 1028 1033 1292  601  752  905  955 1029 1185 1426 1909  402
[3017]  665 1069 1177 1237 1603  108  490  637 1166 1178 1364 1409 1478
[3030] 1971  304  534  761  777  965 1093 1138  154  493  538 1045 1196
[3043] 1371  277  483  900 1106 1119  173  387  435  713  800 1168  827
[3056]  894  926 1288 1611  340  789  980 1003 1253 1257 1397 1524  322
[3069]  805 1194 1284 1411  640  778  818 1136 1281  406  574  594  960
[3082] 1041 1137  658 1039 1238  397  615  966 1215 1299 1546  126   73
[3095]  431 1067  334  653  999 1195  763  314  559  609  675  741  839
[3108] 1154 1372  469  355  580  648  721 1096 1490  826 1225 1291  550
[3121]  613  722  871 1133  680  917  957 1072 1085  270  592  596  706
[3134]  943 1441 1442  500  819 1060 1131 1416 1020 1712  879  990 1258
[3147]  218  815 1175  892 1186  429  797  946 1036 1197  109  740  710
[3160] 1202 1778  216  471  506  670 1236 1271 1437  531  542  856  245
[3173]  399  629  634 1086 1649 1163  503  687  828 1239  426 1402  497
[3186]  717  759 1095  300  369  972 1097   37  241  541  259  519 1187
[3199] 1231  749 1011  343  585  769  208  714  958 1002  864  914  961
[3212]  286  586  571  922  291  403  737 1156  423  785  807  940  389
[3225]  492  513  664  919   25  627 1399  391  445  473  527  587  770
[3238]    3  297  512  715  808  329  540  945 1230  143  589  837  299
[3251]  366  908  731  903  918 1081  335  853 1381 1454  984  123  838
[3264]  557  768  451  415  511  517  840 1323  533  565  862  810  867
[3277]  733  959 1167  264  674  553  951  689  707  736  924  303  446
[3290]  643  859  325  566  788  425  562  666  787 1016 1084 1252  276
[3303]  444  501  728  760  883  581 1356   79  489  701  396  455  909
[3316]  142  676  881  393  950  404  632  994  486  159 1501  114  552
[3329]  551  267  579  835  474 1056  326  524  686  790  246  333  573
[3342]  803  804  901 1192  347  780  275  732  454  599  796  514  155
[3355]  417  268  590  560  584  877  520  755  976  681  307  646  939
[3368]  529  535  956 1161  348  624  754  841  878  272  873  938  798
[3381]  977  516 1090  735  515  247  345  321  491  281  410  776  346
[3394]  691  685  621  623  783  439  364  124  384  441  659  833  411
[3407]  582  764  380  189  336  368  430  181  407  184  705  131  690
[3420]  703  866  223  252  374  997  625  308  532  746  641  378  278
[3433]  448  536   95  168  362  556  781  915  591  244  478  868  709
[3446]   63  201  650  572  765  530   87  253  372  638  200  269  539
[3459]  549  603  726  756  913 1091  791  145  222  575  289  626  227
[3472]  342  595  487  256  612  678  288  600  742  161  931  978  893
[3485]  437  226  388  633  636  331  528  523  456  479  616  543  273
[3498]  744  968  386  338  324  766  891  341  115  453  465  964  377
[3511]  243  495  823  459  472  583  185  422  405  443  545  661  463
[3524]  502  198  284    8  413  122  229  217  890  660  323  144  555
[3537]  127  166  561  319  657  428  727  279  414  716  567  179  139
[3550]  398  605  673  197  141 1066  251  318  320  614  806  260  339
[3563]  158  287  464  162  242  548  311  240  480  608  236  438  327
[3576]  296  381  110   59  418  255  467  306  484  175  330  611   26
[3589]  317  598  254  568  258  165  651   68  178  359   90  309  105
[3602]  205  578   32  280  593  510  485  207  371  409  262  199  367
[3615]  150  392  723  683  360  315  639  442  421  137  202  895  294
[3628]  547  219   51  363   41  186  436  452  263  107  310  169   36
[3641]  365   74  257  209  171  290  239   99  400  440   92  238  305
[3654]  261  224   45  477   19  390  235   66  298  101  230  546   53
[3667]  265  408  170  146  149  172   78  118  237  449  103   65  128
[3680]  177  106  215  250   49  147   71  160   93  120  433   31  434
[3693]  133   70   67  132  148  163   83   56  228  134  401  112  316
[3706]  152  119  191   89   80   85  233  187   82  113   33   43   35
[3719]  283   61  193   94   16  192   88   50  293   72  174  232  129
[3732]   13   34   96  266   55  206  183  153  190  204   54  117  111
[3745]   48  509  140  176  383   17   86   22   77  164   69   30  135
[3758]   91   97  102  100   39  182   75    2   58    9   15  195   44
[3771]   64   38  138   21   42   76   81    4  188   62   18   52    6
[3784]    5   28   84    1   14   20   60   98  104   29    7   57   23
[3797]   27   10   12   11

We can export the matrix (to see how it looks) to an excel file

dim(m) write.csv(as.matrix(frequencies), file = 'matrix.csv')

Let’s check the most and the least frequent words

freq[head(ord)]
   coscar membrillo    cfrank      wish muscavado   csheep’ 
        1         1         1         1         1         1 
freq[tail(ord)]
 fresh garlic  onion    oil   salt pepper 
 17614  18504  19208  23303  24177  26189 

Removing terms that are sparsed, that is those that occur less than in 4 recipes (text documents).

sparse <- removeSparseTerms(frequencies, 1 - 3/nrow(frequencies))
dim(sparse)
[1] 39774  2283

2.1 Visualisation

We create a data frame for visualization.

wf <- data.frame(word = names(freq), freq = freq)
head(wf)
         word freq
black   black 9663
corn     corn 5194
cplain cplain  120
egg       egg 7952
flour   flour 8618
green   green 8383

Plotting histogram that shows terms that appear more than 10000 times in recipes.

chart <- ggplot(subset(wf, freq >10000), aes(x = word, y = freq))
chart <- chart + geom_bar(stat = 'identity', color = 'black', fill = 'white')
chart <- chart + theme(axis.text.x=element_text(angle=45, hjust=1))
chart

We can also determine the terms that mostly associates with the term pepper.

findAssocs(frequencies, c('pepper'), corlimit=0.30)
$pepper
  bell    red  black garlic  onion cayenn ground 
  0.53   0.48   0.43   0.32   0.32   0.31   0.30 

Creating a dataframe from matrix sparse.

newsparse <- as.data.frame(as.matrix(sparse))
dim(newsparse)
[1] 39774  2283

We populate newsparse dataframe cuisine variable with combi$cuisine values.

newsparse$cuisine <- as.factor(combi$cuisine)

After all the data engineering we split our dataset again into train and test.

mytrain <- newsparse[1:nrow(train),]
mytest <- newsparse[-(1:nrow(train)),]

3 XGBoost Model

Creating two objects for train and test that would store our sparse matrix and target variable.

ctrain <- xgb.DMatrix(Matrix(data.matrix(mytrain[,!colnames(mytrain) %in% c('cuisine')])), label = as.numeric(mytrain$cuisine)-1)
dtest <- xgb.DMatrix(Matrix(data.matrix(mytest[,!colnames(mytest) %in% c('cuisine')])), label = as.numeric(mytest$cuisine)-1) 

Creating watchlist to see the intermediate results

watchlist <- list(train = ctrain, test = dtest)
str(ctrain)
Class 'xgb.DMatrix' <externalptr> 
 - attr(*, ".Dimnames")=List of 2
  ..$ : NULL
  ..$ : chr [1:2283] "black" "corn" "cplain" "egg" ...
length(unique(train$cuisine))
[1] 20

For xgboost we choose our objective to be multi:softmax, which will do multiclass classification using the softmax objective. We also need to set num_class(number of classes) to the number of cuisine, which is 20. As out evaluation metric we choose auc (area under the curve).

bst <- xgb.train(data=ctrain,
                 objective="multi:softmax",
                 eval_metric="auc",
                 max_depth=2,
                 num_class=20,
                 nrounds = 300,
                 verbose = 1,
                 # watchlist = watchlist,
                 lambda=0.01)

Now we are going to use our model to classify recipes in test dataset by cuisine.

pred <- predict(bst,dtest,type="class")

4 Evaluation and Conclusion

# auc(predicted = pred, actual = mytest$cuisine)
a <- table(Predicted = pred, Actual = mytest$cuisine)
sum(diag(a))/sum(a)
[1] 0.7696695
 

A work by YOUR NAME

YOUREMAIL@gmail.com