Importing the Libraries

library(reticulate)
use_python("C:/Users/AKM098/AppData/Local/anaconda3/python.exe")

use_virtualenv("r-reticulate")
## Warning: Previous request to
## `use_python("C:/Users/AKM098/AppData/Local/anaconda3/python.exe", required =
## TRUE)` will be ignored. It is superseded by request to
## `use_python("C:/Users/AKM098/OneDrive - Maersk
## Group/Documents/.virtualenvs/r-reticulate/Scripts/python.exe")
getwd()
## [1] "C:/Users/AKM098/OneDrive - Maersk Group/Data_Science/End-to-End-implementation"
setwd("C:\\Users\\AKM098\\OneDrive - Maersk Group\\documents_old\\LLM\\language_detector\\data")
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

import warnings
warnings.simplefilter("ignore")
# Loading the dataset
data = pd.read_csv("C:\\Users\\AKM098\\OneDrive - Maersk Group\\documents_old\\LLM\\language_detector\\data\\Language Detection.csv")
data.tail(10)
##                                                     Text Language
## 10327  ಗಾಡಿಯಲ್ಲಿ ಮನೆಯಲ್ಲಿ ನಾರ್ಸಿಸ್ ಅವಳು ಮನೆಗೆ ತಲುಪಿದಾ...  Kannada
## 10328  ಅವಳು ಮನೆಯಲ್ಲಿ ಕುಳಿತಿದ್ದ ತನ್ನ ತಾಯಿಯನ್ನು ತಬ್ಬಿಕೊ...  Kannada
## 10329  ಓಹ್ ತಾಯಿ ನಾನು ನಿನ್ನನ್ನು ತುಂಬಾ ಪ್ರೀತಿಸುತ್ತೇನೆ ಮ...  Kannada
## 10330  ಅವನು ಸುಂದರವಾಗಿದ್ದನು, ಅವನು ನನ್ನ ಬಗ್ಗೆ ಅಷ್ಟಾಗಿ ಕ...  Kannada
## 10331          ಓಹ್ ಇಲ್ಲ ನೀವು ನೋಡಿದಾಗಿನಿಂದ ಇದು ಸರಿಯಾಗಿದೆ.  Kannada
## 10332  ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...  Kannada
## 10333  ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...  Kannada
## 10334  ಹೇಗೆ ' ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎ...  Kannada
## 10335  ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...  Kannada
## 10336  ಟೆರ್ರಿ ನೀವು ನಿಜವಾಗಿಯೂ ಆ ದೇವದೂತನಂತೆ ಸ್ವಲ್ಪ ಕಾಣು...  Kannada
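Before modelling, a couple of quick sanity checks on the loaded frame can be helpful (an optional sketch):

# basic sanity checks on the loaded dataset
print(data.shape)                     # rows x columns
print(data.isnull().sum())            # missing values per column
print(data["Language"].nunique())     # number of distinct languages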
# value count for each language
data["Language"].value_counts()
## Language
## English       1385
## French        1014
## Spanish        819
## Portugeese     739
## Italian        698
## Russian        692
## Sweedish       676
## Malayalam      594
## Dutch          546
## Arabic         536
## Turkish        474
## German         470
## Tamil          469
## Danish         428
## Kannada        369
## Greek          365
## Hindi           63
## Name: count, dtype: int64
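Since seaborn and matplotlib are already imported, the class imbalance (English dominates with 1385 rows while Hindi has only 63) can be visualized with a simple bar plot; a minimal sketch:

# bar plot of the number of samples per language
plt.figure(figsize=(10, 5))
sns.countplot(y=data["Language"], order=data["Language"].value_counts().index)
plt.title("Samples per language")
plt.xlabel("Count")
plt.tight_layout()
plt.show()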
# separating the independent and dependent features
X = data["Text"]
y = data["Language"]
y[:3]
## 0    English
## 1    English
## 2    English
## Name: Language, dtype: object
from sklearn.model_selection import train_test_split
# note: this preliminary split on the raw text is superseded by the split
# performed again in the "Train Test split" section after vectorization
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)
# converting categorical variables to numerical
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)
y[:10]
## array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3])
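The encoded target is now an array of integer codes (3 corresponds to English here). le.classes_ holds the mapping back to the language names, which is handy later for decoding predictions; a short sketch:

# mapping integer codes back to language names
print(le.classes_[:5])               # classes in their encoded (alphabetical) order
print(le.inverse_transform(y[:3]))   # recovers the original labels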
# creating a bag-of-words representation using CountVectorizer

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(X).toarray()
X.shape
## (10337, 39928)
X[:5]
## array([[0, 0, 0, ..., 0, 0, 0],
##        [0, 0, 0, ..., 0, 0, 0],
##        [0, 0, 0, ..., 0, 0, 0],
##        [0, 0, 0, ..., 0, 0, 0],
##        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)
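The 39928 columns correspond to the tokens in the learned vocabulary. To peek at it (a sketch; get_feature_names_out() assumes a reasonably recent scikit-learn version):

# inspecting the learned vocabulary
print(len(cv.vocabulary_))               # should equal the number of columns above
print(cv.get_feature_names_out()[:10])   # first few tokens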

Train Test split

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)
x_train.shape
## (8269, 39928)
x_test.shape
## (2068, 39928)
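The shapes above come from a plain random split. As an optional refinement, passing stratify and random_state keeps the per-language proportions identical in train and test and makes the run reproducible (a sketch; the figures reported in this post were produced without it):

# optional: stratified, reproducible split
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)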

Model creation and Prediction

from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(x_train, y_train)
## MultinomialNB()
# prediction
y_pred = model.predict(x_test)

Evaluating the model

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

ac = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)
print("Accuracy is :",ac)
## Accuracy is : 0.9787234042553191
# classification report
print(cr)
##               precision    recall  f1-score   support
## 
##            0       1.00      0.96      0.98       117
##            1       1.00      0.96      0.98        81
##            2       1.00      1.00      1.00       105
##            3       0.88      1.00      0.93       271
##            4       1.00      0.99      0.99       216
##            5       1.00      0.96      0.98        92
##            6       1.00      0.99      0.99        71
##            7       1.00      1.00      1.00        13
##            8       0.99      0.99      0.99       143
##            9       1.00      0.99      0.99        71
##           10       0.98      0.98      0.98       116
##           11       1.00      0.97      0.99       146
##           12       1.00      0.96      0.98       142
##           13       0.99      0.98      0.99       168
##           14       0.99      0.98      0.99       126
##           15       1.00      0.96      0.98        95
##           16       1.00      0.95      0.97        95
## 
##     accuracy                           0.98      2068
##    macro avg       0.99      0.98      0.98      2068
## weighted avg       0.98      0.98      0.98      2068
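The confusion matrix cm computed above can be inspected as a heatmap, with le.classes_ supplying readable tick labels instead of the integer codes; a minimal sketch:

# heatmap of the confusion matrix computed above
plt.figure(figsize=(12, 8))
sns.heatmap(cm, annot=True, fmt="d",
            xticklabels=le.classes_, yticklabels=le.classes_)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()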

Model Saving

# saving both cv and model
#pickle.dump(cv, open("transform.pkl", "wb"))
#pickle.dump(model, open("model.pkl", "wb"))
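For completeness, this is how the two saved artifacts would be reloaded and used together at prediction time (a sketch; it assumes the two dump lines above have been uncommented and run):

# reloading the saved vectorizer and model (assumes transform.pkl / model.pkl exist)
loaded_cv = pickle.load(open("transform.pkl", "rb"))
loaded_model = pickle.load(open("model.pkl", "rb"))
pred = loaded_model.predict(loaded_cv.transform(["some new text"]).toarray())
print(le.inverse_transform(pred))   # map the integer prediction back to a language name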

Creating the Pipeline

# Loading the dataset
#data = pd.read_csv("Language Detection.csv")
# separating the independent and dependent features
X = data["Text"]
y = data["Language"]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Custom function to convert sparse matrix to dense array
def to_array(X):
    return X.toarray()

# Define the pipeline steps
steps = [
    ('vectorizer', CountVectorizer()),
    ('to_array', FunctionTransformer(to_array, accept_sparse=True)),  # Convert to dense array
    ('classifier', MultinomialNB())
]

# Create the pipeline
pipe = Pipeline(steps)
# Train the pipeline
pipe.fit(X_train, y_train)
## Pipeline(steps=[('vectorizer', CountVectorizer()),
##                 ('to_array',
##                  FunctionTransformer(accept_sparse=True,
##                                      func=<function to_array at 0x000001DF75652A20>)),
##                 ('classifier', MultinomialNB())])
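A side note: MultinomialNB can also consume the sparse matrix that CountVectorizer produces directly, so the to_array step is mainly there to mirror the earlier .toarray() call; a leaner two-step pipeline sketch that should behave equivalently:

# equivalent pipeline without the dense-conversion step (MultinomialNB accepts sparse input)
sparse_pipe = Pipeline([
    ("vectorizer", CountVectorizer()),
    ("classifier", MultinomialNB()),
])
sparse_pipe.fit(X_train, y_train)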
y_pred = pipe.predict(X_test)
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

ac = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)
print("Accuracy is :",ac)
## Accuracy is : 0.9840425531914894
print(classification_report(y_test, y_pred))
##               precision    recall  f1-score   support
## 
##       Arabic       1.00      0.97      0.98       116
##       Danish       1.00      0.93      0.96        87
##        Dutch       1.00      0.98      0.99        97
##      English       0.91      1.00      0.95       279
##       French       0.99      1.00      0.99       221
##       German       1.00      0.99      0.99        75
##        Greek       1.00      0.99      0.99        84
##        Hindi       1.00      0.90      0.95        10
##      Italian       1.00      0.99      1.00       136
##      Kannada       1.00      0.95      0.97        77
##    Malayalam       1.00      0.99      1.00       114
##   Portugeese       0.99      1.00      1.00       136
##      Russian       1.00      0.97      0.98       158
##      Spanish       1.00      0.99      0.99       151
##     Sweedish       0.99      1.00      1.00       135
##        Tamil       1.00      1.00      1.00       100
##      Turkish       1.00      0.96      0.98        92
## 
##     accuracy                           0.98      2068
##    macro avg       0.99      0.98      0.98      2068
## weighted avg       0.99      0.98      0.98      2068
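Because a single 80/20 split can be somewhat noisy, an optional robustness check is to cross-validate the whole pipeline on the raw text and labels; a sketch:

# optional: 5-fold cross-validation of the full pipeline
from sklearn.model_selection import cross_val_score
scores = cross_val_score(pipe, X, y, cv=5)
print(scores.mean(), scores.std())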
#import pickle

#create pickle file
#pickle.dump(pipe, open('tr_pipe_lng_model.0.1.pkl', 'wb'))
#pkl_model = pickle.load(open('tr_pipe_lng_model.0.1.pkl', 'rb'))
val = ['¿Tienes sentido común?']
# another string to try: "spiste morgenmad" (Danish for "ate breakfast")
pipe.predict(val)
## array(['Spanish'], dtype='<U10')
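Finally, the fitted pipeline can be wrapped in a small helper for ad-hoc use (detect_language is a hypothetical name, not part of the original code; the exact predictions depend on the trained model):

# convenience wrapper around the trained pipeline (hypothetical helper)
def detect_language(texts):
    """Return the predicted language for each string in texts."""
    return pipe.predict(texts)

detect_language(["Jeg spiste morgenmad", "Wie geht es dir?"])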