# https://dacon.io/competitions/open/235536/overview/description
# Preprocessing/modelling code only, EDA omitted (test score: 500269.8041044748)
# Setup: load modelling/wrangling packages and read the competition files.
# NOTE(review): the original began with rm(list = ls()), which wipes the
# user's entire global environment as a side effect; that line is removed —
# run the script in a fresh R session instead.
library(lightgbm)
library(tidyverse)
library(data.table)
library(fastDummies)
library(MLmetrics)
library(lubridate)

# Empty strings in the raw CSVs represent missing values (na.strings = '').
train <- fread('movies_train.csv', encoding = 'UTF-8', na.strings = c(''))
test <- fread('movies_test.csv', encoding = 'UTF-8', na.strings = c(''))
submission <- fread('submission.csv', encoding = 'UTF-8')
# Quick EDA: structure, summary statistics, and per-column cardinality.
glimpse(train)
glimpse(test)
glimpse(submission)
summary(train)

# Distinct values per column. vapply() is type-stable, unlike sapply(),
# whose return type silently depends on its input.
vapply(train, function(x) length(unique(x)), integer(1))
# Duplicated values per column (rows minus distinct values).
vapply(train, function(x) nrow(train) - length(unique(x)), integer(1))

# Inspect the categorical columns that will need encoding below.
unique(train$distributor)
unique(train$genre)
unique(train$screening_rat)
unique(train$director)
unique(train$dir_prev_num)
# Feature engineering ----
# Capture the categorical levels BEFORE train is converted to numeric
# codes. BUG in the original: the test-set encoding used
# unique(train$genre) AFTER train$genre had already been turned into
# numbers, so no test string ever matched a level and every test
# genre/screening_rat value became NA.
genre_levels <- unique(train$genre)
rat_levels <- unique(train$screening_rat)

# Train: date parts, drop high-cardinality text columns, fill missing
# previous-film averages with 0, integer-encode the categoricals.
train$year <- year(train$release_time)
train$month <- month(train$release_time)
train <- train[, -c('distributor', 'title', 'release_time', 'director')]
train <- train %>% replace_na(list(dir_prev_bfnum = 0))
train$genre <- as.numeric(factor(train$genre, levels = genre_levels))
train$screening_rat <- as.numeric(factor(train$screening_rat, levels = rat_levels))

# Test: identical transformations, reusing the train-derived levels so
# both sets share one consistent integer encoding.
test$year <- year(test$release_time)
test$month <- month(test$release_time)
test <- test[, -c('distributor', 'title', 'release_time', 'director')]
test <- test %>% replace_na(list(dir_prev_bfnum = 0))
test$genre <- as.numeric(factor(test$genre, levels = genre_levels))
test$screening_rat <- as.numeric(factor(test$screening_rat, levels = rat_levels))
# Model setup ----
# Fit on log1p of the target: LightGBM minimises squared error, and the
# log transform tames the heavy right tail of box-office counts.
X <- train[, -'box_off_num']
y <- log1p(train$box_off_num)
dtrain <- lgb.Dataset(data = as.matrix(X), label = y)

set.seed(1)

# Hyper-parameter grid: one row per candidate tree depth, paired with the
# maximum leaf count of a full binary tree of that depth (2^d - 1).
max_depth <- 3:10
num_leaves <- 2^max_depth - 1
params_table <- data.frame(
  max_depth = max_depth,
  num_leaves = num_leaves,
  best_iter = 0,
  best_score = 0
)
# 10-fold CV over every row of the grid, recording the best iteration and
# score. seq_len() is safe even for a zero-row table, where 1:nrow(...)
# would wrongly iterate over c(1, 0).
for (p in seq_len(nrow(params_table))) {
  params <- list(
    objective = 'regression',
    max_depth = params_table$max_depth[p],
    num_leaves = params_table$num_leaves[p]
  )
  # Re-seed before each lgb.cv call so every grid row sees identical folds.
  set.seed(1)
  lgb_model_cv <- lgb.cv(
    data = dtrain,
    nfold = 10,
    nrounds = 100,
    params = params,
    verbose = -1
  )
  params_table$best_iter[p] <- lgb_model_cv$best_iter
  params_table$best_score[p] <- lgb_model_cv$best_score
  cat(p, ':', lgb_model_cv$best_iter, lgb_model_cv$best_score, '\n')
}
# Inspect the CV results: full grid, winning row, and score-vs-depth plot.
params_table
params_table[which.min(params_table$best_score), ]
params_table %>%
  ggplot(aes(x = max_depth, y = best_score)) +
  geom_point()

# Carry the lowest-scoring configuration forward into the final fit.
best_case <- which.min(params_table$best_score)
best_iter <- params_table$best_iter[best_case]
best_params <- list(
  objective = 'regression',
  max_depth = params_table$max_depth[best_case],
  num_leaves = params_table$num_leaves[best_case]
)
# Final model ----
# Refit on all training data with the CV-selected configuration.
set.seed(1)
lgb_model_final <- lgb.train(
  data = dtrain,
  nrounds = best_iter,
  params = best_params
)

# Feature importance. Use TRUE, never the reassignable shorthand T.
lgb.plot.importance(lgb.importance(lgb_model_final, percentage = TRUE))

# Training-set error on the original scale — expm1() undoes the log1p
# transform applied to the target.
pred_train <- predict(lgb_model_final, as.matrix(X))
MAE(expm1(pred_train), expm1(y))
RMSE(expm1(pred_train), expm1(y))

# Test predictions, back-transformed to raw box-office counts.
pred_test <- expm1(predict(lgb_model_final, as.matrix(test)))
pred_test
submission$box_off_num <- pred_test
# NOTE(review): the filled submission is never written to disk in this
# chunk — if the script ends here, add e.g.
# fwrite(submission, 'submission_lgb.csv').