data science/machine learning

Naïve Bayes (나이브 베이즈) Classifier - 실전 2

꼰대코더 2025. 3. 8. 16:35
import numpy as np
import pandas as pd

# Load the '::'-delimited ratings file. It is read without a header row
# (header=None), so the four column names are attached right after loading.
df = pd.read_csv(
    data_path,
    sep='::',
    engine='python',  # a multi-character separator needs the Python parser
    header=None,
)
df.columns = ['user_id', 'movie_id', 'rating', 'timestamp']

 

# Dimensions of the rating matrix: distinct users (rows) and distinct movies (columns).
n_users, n_movies = (df[column].nunique() for column in ('user_id', 'movie_id'))

def load_user_rating_data(df, n_users, n_movies):
    """Build a dense user-by-movie rating matrix from a ratings table.

    Args:
        df: Table exposing 'user_id', 'movie_id' and 'rating' columns.
        n_users: Number of matrix rows. A user with ID ``u`` is stored in row
            ``u - 1``, so every user ID must lie in ``1..n_users``.
        n_movies: Number of matrix columns. Movies are assigned columns in
            order of first appearance, so at most ``n_movies`` distinct
            movie IDs may occur.

    Returns:
        A tuple ``(data, movie_id_mapping)`` where ``data`` is an
        ``(n_users, n_movies)`` integer array (cells that are never assigned
        stay 0) and ``movie_id_mapping`` maps each movie ID to its column.

    Raises:
        IndexError: If a user ID falls outside ``1..n_users`` or more than
            ``n_movies`` distinct movie IDs are encountered.
    """
    data = np.zeros([n_users, n_movies], dtype=np.intc)
    movie_id_mapping = {}
    for user_id, movie_id, rating in zip(df['user_id'], df['movie_id'], df['rating']):
        row = int(user_id) - 1
        # Reject out-of-range rows explicitly: a user ID <= 0 would otherwise
        # wrap around via negative indexing and overwrite another user's row.
        if not 0 <= row < n_users:
            raise IndexError(
                f'user_id {user_id} is outside the expected range 1..{n_users}'
            )
        # First time a movie is seen it gets the next free column.
        col = movie_id_mapping.setdefault(movie_id, len(movie_id_mapping))
        if col >= n_movies:
            raise IndexError(
                f'more than {n_movies} distinct movie_id values in the data'
            )
        data[row, col] = rating
    return data, movie_id_mapping

# Build the dense user-by-movie rating matrix and the movie_id -> column lookup.
data, movie_id_mapping = load_user_rating_data(df, n_users, n_movies)

# Count how often each value occurs in the matrix; cells that were never
# assigned a rating keep their initial 0.
values, counts = np.unique(data, return_counts=True)
for value, count in zip(values, counts):
    print(f'Number of rating {value}: {count}')

# Number of rating rows per movie_id
print(df['movie_id'].value_counts())
Number of rating 0: 21384031
Number of rating 1: 56174
Number of rating 2: 107557
Number of rating 3: 261197
Number of rating 4: 348971
Number of rating 5: 226310

2858   3428
260     2991
1196   2990
1210   2883
480     2672
...
3458   1
2226   1
1815   1
398     1
2909   1

Name: movie_id, Length: 3706, dtype: int64

 

movie_id = 2858을 대상으로 추천을 할지 안 할지를 알아보자.
즉, 데이터셋에서 movie_id = 2858 열을 제거한 나머지를 특징(X)으로 사용하고, 제거한 열의 평점을 타깃(Y)으로 사용한다.

# Split the rating matrix around one movie: its column becomes the target,
# every remaining column is kept as a feature.
target_movie_id = 2858
Y_raw = data[:, movie_id_mapping[target_movie_id]]
X_raw = np.delete(data, movie_id_mapping[target_movie_id], axis=1)

# Keep only the users that actually rated the target movie (rating > 0).
X, Y = X_raw[Y_raw > 0], Y_raw[Y_raw > 0]

print('Shape of X:', X.shape)
print('Shape of Y:', Y.shape)
Shape of X: (3428, 3705)
Shape of Y: (3428,)

 

추천(=1), 비추천(=0)으로 이진화하기 위해 rating 3을 기준으로 타깃을 변환 (3 이하는 비추천, 3 초과는 추천)

# Binarise the target in place: ratings above the threshold become 1
# (recommend), all others become 0 (do not recommend).
recommend = 3
Y[:] = Y > recommend

# Class balance after binarisation
n_pos, n_neg = (Y == 1).sum(), (Y == 0).sum()
print(f'{n_pos} positive samples and {n_neg} negative samples.')
2853 positive samples and 575 negative samples.

 

준비된 데이터셋을 8:2로 학습용과 테스트용으로 분리

from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
# Number of training and test samples
print(len(Y_train), len(Y_test))
2742 686

 

Naïve Bayes 트레이닝

from sklearn.naive_bayes import MultinomialNB
# alpha=1.0 applies additive (Laplace) smoothing; fit_prior=True learns the
# class prior probabilities from the training labels.
clf = MultinomialNB(alpha=1.0, fit_prior=True)
clf.fit(X_train, Y_train)

 

Accuracy(정확도) 체크

 
# Per-class probabilities for the first ten test samples; columns follow
# clf.classes_ (presumably [class 0, class 1] given the 0/1 labels).
prediction_prob = clf.predict_proba(X_test)
print(prediction_prob[0:10])

# Hard class predictions, followed by the true labels of the same ten samples
prediction = clf.predict(X_test)
print(prediction[:10])
print(Y_test[:10])

# Mean accuracy on the held-out test set
accuracy = clf.score(X_test, Y_test)
print(f'The accuracy is: {accuracy*100:.1f}%')
[[7.50487439e-23 1.00000000e+00]
[1.01806208e-01 8.98193792e-01]
[3.57740570e-10 1.00000000e+00]
[1.00000000e+00 2.94095407e-16]
[1.00000000e+00 2.49760836e-25]
[7.62630220e-01 2.37369780e-01]
[3.47479627e-05 9.99965252e-01]
[2.66075292e-11 1.00000000e+00]
[5.88493563e-10 9.99999999e-01]
[9.71326867e-09 9.99999990e-01]]

Predict       -> [1 1 1 0 0 0 1 1 1 1]   10개 샘플중 4개가 틀림
Y_test[:10] -> [1 1 1 1 1 1 0 1 1 1]

The accuracy is: 71.6%