본문 바로가기

DACON

[DACON] 영화 관객수 예측 모델 개발 (R, lightGBM)

 

https://dacon.io/competitions/open/235536/overview/description

 

[문화] 영화 관객수 예측 모델 개발 - DACON

데이터 사이언스, 머신러닝을 처음 접하시거나, 연습용으로 데이콘 대회를 경험해보고 싶으신 분들은 영화 관객수 예측을 시작하세요! 데이터를 다운받으신 후 튜토리얼 코드(코드공유)와 동영상 …

dacon.io

 

EDA 생략된 처리 코드만 (test score: 500269.8041044748)

# NOTE(review): removed `rm(list = ls())` — wiping the caller's global
# environment is a script anti-pattern; run this script in a fresh R
# session instead if a clean workspace is needed.

library(lightgbm)
library(tidyverse)
library(data.table)
library(fastDummies)
library(MLmetrics)
library(lubridate)

# Load the competition data; empty strings in the CSVs are read as NA.
train <- fread('movies_train.csv', encoding = 'UTF-8', na.strings = c(''))
test <- fread('movies_test.csv', encoding = 'UTF-8', na.strings = c(''))
submission <- fread('submission.csv', encoding = 'UTF-8')


# Quick structural overview of the three tables.
glimpse(train)
glimpse(test)
glimpse(submission)

summary(train)

# Distinct-value count per column (cardinality).
vapply(train, function(col) length(unique(col)), integer(1))

# Duplicated-value count per column: total rows minus distinct values.
vapply(train, function(col) nrow(train) - length(unique(col)), integer(1))

# Raw categories of the discrete columns.
unique(train$distributor)
unique(train$genre)
unique(train$screening_rat)
unique(train$director)
unique(train$dir_prev_num)


# --- Feature engineering: train --------------------------------------------

# Extract release year/month before the raw date column is dropped.
train$year <- year(train$release_time)
train$month <- month(train$release_time)

# BUGFIX: capture the categorical levels *before* train$genre and
# train$screening_rat are overwritten with numeric codes. The original
# code re-ran `unique(train$genre)` when encoding the test set, but by
# then the column already held numbers, so factor() matched nothing and
# every test-set category silently became NA.
genre_levels <- unique(train$genre)
screening_rat_levels <- unique(train$screening_rat)

# Drop identifier-like columns that are not used as features.
train <- train[, -c('distributor', 'title', 'release_time', 'director')]

# Missing "previous films' average audience" means no previous film -> 0.
train <- train %>% replace_na(list(dir_prev_bfnum = 0))

# Integer-encode the categoricals (lightgbm needs a numeric matrix).
train$genre <- as.numeric(factor(train$genre, levels = genre_levels))
train$screening_rat <- as.numeric(factor(train$screening_rat, levels = screening_rat_levels))


# --- Same pipeline: test ---------------------------------------------------

test$year <- year(test$release_time)
test$month <- month(test$release_time)

test <- test[, -c('distributor', 'title', 'release_time', 'director')]

test <- test %>% replace_na(list(dir_prev_bfnum = 0))

# Encode with the *train* levels so codes agree across the two sets;
# categories unseen in train become NA, which lightgbm can handle.
test$genre <- as.numeric(factor(test$genre, levels = genre_levels))
test$screening_rat <- as.numeric(factor(test$screening_rat, levels = screening_rat_levels))


# model ---------------------------------------------------------------------

# Predictors and target; the target is log1p-transformed so the
# regression optimises relative rather than absolute error.
X <- train[, -'box_off_num']
y <- log1p(train$box_off_num)

dtrain <- lgb.Dataset(data = as.matrix(X), label = y)

set.seed(1)

# Candidate tree depths, each paired with its fully-grown leaf count.
max_depth <- 3:10
num_leaves <- 2^max_depth - 1

# One row per candidate; best_iter/best_score are filled in by the CV loop.
params_table <- data.frame(
  max_depth = max_depth,
  num_leaves = num_leaves,
  best_iter = 0,
  best_score = 0
)

# Grid search over tree depth with 10-fold CV; record each candidate's
# best boosting iteration and CV score.
# (seq_len() instead of 1:nrow() — safe if the table were ever empty.)
for (p in seq_len(nrow(params_table))) {

  params <- list(
    objective = 'regression',
    max_depth = params_table$max_depth[p],
    num_leaves = params_table$num_leaves[p]
  )

  # Reset the seed per candidate so the fold splits are identical,
  # keeping the CV scores comparable across rows.
  set.seed(1)

  lgb_model_cv <- lgb.cv(
    data = dtrain,
    nfold = 10,
    nrounds = 100,
    params = params,
    verbose = -1
  )

  params_table$best_iter[p] <- lgb_model_cv$best_iter
  params_table$best_score[p] <- lgb_model_cv$best_score

  cat(p, ':', lgb_model_cv$best_iter, lgb_model_cv$best_score, '\n')
}




# Tuning results overview.
params_table

# Row with the lowest CV score.
params_table[which.min(params_table$best_score), ]

# CV score as a function of tree depth.
params_table %>% ggplot(aes(x = max_depth, y = best_score)) + geom_point()

# Pull out the winning configuration for the final fit.
best_case <- which.min(params_table$best_score)
best_iter <- params_table$best_iter[best_case]

best_params <- list(
  objective = 'regression',
  max_depth = params_table$max_depth[best_case],
  num_leaves = params_table$num_leaves[best_case]
)

# Fit the final model on the full training set with the tuned parameters.
set.seed(1)
lgb_model_final <- lgb.train(
  data = dtrain,
  nrounds = best_iter,
  params = best_params
)

# Feature importance plot (TRUE instead of T — T can be reassigned).
lgb.plot.importance(lgb.importance(lgb_model_final, percentage = TRUE))

# Training-set fit, reported back on the original audience-count scale.
pred_train <- predict(lgb_model_final, as.matrix(X))
MAE(expm1(pred_train), expm1(y))
RMSE(expm1(pred_train), expm1(y))

# Test-set predictions, back-transformed from the log1p scale.
pred_test <- expm1(predict(lgb_model_final, as.matrix(test)))
pred_test

submission$box_off_num <- pred_test
# NOTE(review): the filled-in submission is never written to disk; add
# e.g. fwrite(submission, 'submission_lgbm.csv') to export it.