import pandas as pd
import matplotlib.pyplot as plt

# Read the training and test splits from the working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('movies_train.csv', 'movies_test.csv')
)


# Notebook previews of the training frame: first rows, summary statistics,
# and per-column dtype / non-null counts.
train.head()


train.describe()


train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           600 non-null    object 
 1   distributor     600 non-null    object 
 2   genre           600 non-null    object 
 3   release_time    600 non-null    object 
 4   time            600 non-null    int64  
 5   screening_rat   600 non-null    object 
 6   director        600 non-null    object 
 7   dir_prev_bfnum  270 non-null    float64
 8   dir_prev_num    600 non-null    int64  
 9   num_staff       600 non-null    int64  
 10  num_actor       600 non-null    int64  
 11  box_off_num     600 non-null    int64  
dtypes: float64(1), int64(5), object(6)
memory usage: 56.4+ KB


# Configure the font up front so that no text in the figure is created
# before the setting is in place.
plt.rc('font', family='Gulim')
plt.figure(figsize=(20, 6))

# genre: mean box office per genre.
# numeric_only=True keeps the aggregation to numeric columns; averaging the
# string columns (title, director, ...) raises on pandas >= 2.0.
plt.subplot(221)
train_group_genre = train.groupby('genre').mean(numeric_only=True)
plt.plot(train_group_genre['box_off_num'])
plt.grid()

# screening_rat: mean box office per screening rating.
plt.subplot(222)
train_group_screen = train.groupby('screening_rat').mean(numeric_only=True)
plt.plot(train_group_screen['box_off_num'])
plt.grid()

# year: derive the release year from the first four characters of
# release_time, then plot mean box office per year.
plt.subplot(223)
train['year'] = train['release_time'].apply(lambda x: int(x[:4]))
train_group_year = train.groupby('year').mean(numeric_only=True)
plt.plot(train_group_year['box_off_num'])
plt.grid()

# distributor
plt.subplot(224)
# value_counts() sorts by frequency, so the first five entries are the five
# distributors with the most titles in the training set.
d_top5 = train['distributor'].value_counts().head(5)
def distributor_top5(distributor):
    """Collapse distributors outside the top five into a single bucket.

    Args:
        distributor: Distributor name to classify.

    Returns:
        ``distributor`` unchanged when it is one of the labels of the
        module-level ``d_top5`` Series, otherwise the catch-all label
        '기타' ("other").
    """
    # d_top5 is indexed by distributor name, so membership is an index lookup.
    if distributor in d_top5.index:
        return distributor
    return '기타'
        
# Relabel every distributor outside the top five as the catch-all bucket,
# in both splits, so train and test share the same category set.
train['distributor'] = train['distributor'].apply(distributor_top5)
test['distributor'] = test['distributor'].apply(distributor_top5)

# Mean box office per (bucketed) distributor. numeric_only=True avoids
# averaging string columns, which raises on pandas >= 2.0.
train_group_dis = train.groupby('distributor').mean(numeric_only=True)
plt.plot(train_group_dis['box_off_num'])
plt.grid()  # match the other three panels

plt.show()


# Preview the rows whose screening rating is '전체 관람가'.
train.loc[train['screening_rat'] == '전체 관람가']


import seaborn as sns

# Correlate numeric columns only: DataFrame.corr() on a frame that still
# contains string columns raises on pandas >= 2.0, and select_dtypes works
# on every pandas version.
sns.heatmap(train.select_dtypes(include='number').corr(), annot=True)
plt.show()


# Mean of every numeric column per (distributor, genre) pair, flattened back
# to ordinary columns. numeric_only=True is required on pandas >= 2.0, where
# averaging the string columns raises. Groups whose values are all missing
# produce NaN means; those are replaced with 0.
prev_mean = (
    train.groupby(['distributor', 'genre'])
    .mean(numeric_only=True)
    .reset_index()
    .fillna(0)
)

# Sanity check: no nulls should remain in any column.
prev_mean.isnull().sum()
# train[(train.genre == '뮤지컬') & (train.distributor == '기타')]

distributor       0
genre             0
time              0
dir_prev_bfnum    0
dir_prev_num      0
num_staff         0
num_actor         0
box_off_num       0
year              0
dtype: int64


# Fill null dir_prev_bfnum values with the mean for the row's
# (genre, distributor) pair.
def myFillna(df):
    """Return the ``dir_prev_bfnum`` value to use for a single row.

    Args:
        df: One row (a ``pandas.Series``) exposing ``genre``,
            ``distributor`` and ``dir_prev_bfnum``.

    Returns:
        The row's own ``dir_prev_bfnum`` when it is present. Otherwise the
        ``dir_prev_bfnum`` recorded in the module-level ``prev_mean`` table
        for the same (genre, distributor) pair, or 0 when the table has no
        such pair.
    """
    # A value that is already present is always kept.
    if pd.notna(df.dir_prev_bfnum):
        return df.dir_prev_bfnum

    same_group = (prev_mean['genre'] == df.genre) & (
        prev_mean['distributor'] == df.distributor
    )
    candidates = prev_mean.loc[same_group, 'dir_prev_bfnum']

    # Explicit "pair not found" fallback instead of a bare except, so real
    # errors (missing columns, undefined prev_mean, ...) are no longer hidden.
    if candidates.empty:
        return 0
    # .iloc[0] rather than float(Series): calling float() on a Series is
    # deprecated in pandas.
    return float(candidates.iloc[0])
    
# Impute dir_prev_bfnum row by row in both splits; myFillna only needs the
# two grouping columns plus the value itself.
for frame in (train, test):
    frame['dir_prev_bfnum'] = frame[
        ['genre', 'distributor', 'dir_prev_bfnum']
    ].apply(myFillna, axis=1)


# Model inputs, shared by both splits. (An earlier variant used the same
# columns without dir_prev_bfnum.)
feature_columns = [
    'time', 'num_staff', 'genre', 'screening_rat', 'distributor',
    'dir_prev_bfnum',
]
X_train = train[feature_columns]
X_test = test[feature_columns]

# Regression target.
y_train = train['box_off_num']

X_test.head()


# One-hot encode the categorical columns. Encoding the two splits separately
# can yield different column sets or orders (a category present in only one
# split), so the test matrix is aligned to the training columns: categories
# missing from test become all-zero columns, and categories unseen in train
# are dropped.
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)


from sklearn.ensemble import RandomForestRegressor

# Fix the seed so that repeated runs produce the same forest and therefore
# the same submission; without it every run gives different predictions.
model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

RandomForestRegressor(n_estimators=200)


# Predict box_off_num for every row of the encoded test matrix.
pre = model.predict(X_test)
# Notebook preview of the first five predictions.
pre[:5]

array([2305554.63 , 1467952.33 , 1752211.165, 1701600.515, 1273396.345])


# Load the submission template and set its box_off_num column to the model
# predictions.
sub = pd.read_csv('./submission.csv').assign(box_off_num=pre)

sub


# Write the result without the row index.
sub.to_csv('output.csv', index=False)

	title	distributor	genre	release_time	time	screening_rat	director	dir_prev_bfnum	dir_prev_num	num_staff	num_actor	box_off_num
0	개들의 전쟁	롯데엔터테인먼트	액션	2012-11-22	96	청소년 관람불가	조병옥	NaN	0	91	2	23398
1	내부자들	(주)쇼박스	느와르	2015-11-19	130	청소년 관람불가	우민호	1161602.50	2	387	3	7072501
2	은밀하게 위대하게	(주)쇼박스	액션	2013-06-05	123	15세 관람가	장철수	220775.25	4	343	4	6959083
3	나는 공무원이다	(주)NEW	코미디	2012-07-12	101	전체 관람가	구자홍	23894.00	2	20	6	217866
4	불량남녀	쇼박스(주)미디어플렉스	코미디	2010-11-04	108	15세 관람가	신근호	1.00	1	251	2	483387

	time	dir_prev_bfnum	dir_prev_num	num_staff	num_actor	box_off_num
count	600.000000	2.700000e+02	600.000000	600.000000	600.000000	6.000000e+02
mean	100.863333	1.050443e+06	0.876667	151.118333	3.706667	7.081818e+05
std	18.097528	1.791408e+06	1.183409	165.654671	2.446889	1.828006e+06
min	45.000000	1.000000e+00	0.000000	0.000000	0.000000	1.000000e+00
25%	89.000000	2.038000e+04	0.000000	17.000000	2.000000	1.297250e+03
50%	100.000000	4.784236e+05	0.000000	82.500000	3.000000	1.259100e+04
75%	114.000000	1.286569e+06	2.000000	264.000000	4.000000	4.798868e+05
max	180.000000	1.761531e+07	5.000000	869.000000	25.000000	1.426277e+07

	title	distributor	genre	release_time	time	screening_rat	director	dir_prev_bfnum	dir_prev_num	num_staff	num_actor	box_off_num	year
3	나는 공무원이다	(주)NEW	코미디	2012-07-12	101	전체 관람가	구자홍	23894.0	2	20	6	217866	2012
6	길위에서	기타	다큐멘터리	2013-05-23	104	전체 관람가	이창재	NaN	0	32	5	53526	2013
8	1789, 바스티유의 연인들	기타	뮤지컬	2014-09-18	129	전체 관람가	정성복	NaN	0	3	5	4778	2014
17	별이 빛나는 밤	기타	드라마	2012-04-05	98	전체 관람가	린슈유	773.0	1	8	4	5693	2012
32	다이노 타임	CJ 엔터테인먼트	애니메이션	2015-04-30	85	전체 관람가	최윤석	NaN	0	10	8	285084	2015
...	...	...	...	...	...	...	...	...	...	...	...	...	...
578	메밀꽃, 운수 좋은 날, 그리고 봄봄	기타	애니메이션	2014-08-21	90	전체 관람가	안재훈	53235.0	1	167	7	35567	2014
580	정글히어로	CJ 엔터테인먼트	애니메이션	2014-10-02	82	전체 관람가	박태동	NaN	0	0	6	72052	2014
582	뽀로로 극장판 컴퓨터 왕국 대모험	(주)NEW	애니메이션	2015-12-10	62	전체 관람가	박영균	NaN	0	2	8	446054	2015
585	후쿠시마의 미래	기타	다큐멘터리	2015-04-09	70	전체 관람가	이홍기	NaN	0	4	1	938	2015
587	서유기 리턴즈	기타	SF	2011-02-17	79	전체 관람가	신재호	67602.0	1	220	4	12696	2011

	time	num_staff	genre	screening_rat	distributor	dir_prev_bfnum
0	125	304	느와르	청소년 관람불가	기타	3.005290e+05
1	113	275	멜로/로맨스	12세 관람가	(주)쇼박스	3.427002e+05
2	115	419	드라마	12세 관람가	CJ 엔터테인먼트	4.206611e+06
3	116	408	액션	15세 관람가	(주)쇼박스	6.913420e+05
4	110	380	공포	15세 관람가	CJ 엔터테인먼트	3.173800e+04

	title	box_off_num
0	용서는 없다	2305554.630
1	아빠가 여자를 좋아해	1467952.330
2	하모니	1752211.165
3	의형제	1701600.515
4	평행 이론	1273396.345
...	...	...
238	해에게서 소년에게	4226.065
239	울보 권투부	5692.110
240	어떤살인	320057.235
241	말하지 못한 비밀	6102.555
242	조선안방 스캔들-칠거지악 2	1323.480

Load Data

EDA

데이터 전처리

모델 학습

Predict