-
Notifications
You must be signed in to change notification settings - Fork 15
/
heritage.R
104 lines (80 loc) · 2.64 KB
/
heritage.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
########################################
# Example GBM model for HHP
# scores ~ 0.4635 on leaderboard
# which would be 55th position of 510
# as at 9th Sept 2011
#
# Requires the data having been prepared
# using the SQL supplied
#
########################################
# Record the start time so the total run time can be reported at the end.
starttime <- proc.time()

########################################
# load the data
########################################
alldata <- read.csv("modeling_set1.csv")

########################################
# arrange the data
########################################
# Work around a loading quirk: the first column arrives named X.MemberID_t
# instead of MemberID_t, so its name is reset by position.
colnames(alldata)[1] <- "MemberID_t"

# Identify train and leaderboard rows: trainset == 1 marks training rows,
# trainset == 0 marks the rows to be scored.
trainrows <- which(alldata$trainset == 1)
scorerows <- which(alldata$trainset == 0)

# Sanity check the size of each set. print() is called explicitly so the
# values are shown even when the script is run through source(), which does
# not auto-print top-level expressions.
print(length(trainrows))
print(length(scorerows))

# Display the column names.
print(colnames(alldata))

# MemberID is required as the key for the submission set, so keep a copy of
# it for the scoring rows before the column is dropped below.
memberid <- alldata[scorerows, "MemberID_t"]

# Remove fields that must not be used as predictors.
alldata$MemberID_t <- NULL
alldata$YEAR_t <- NULL
alldata$trainset <- NULL

# Target - what we are predicting.
theTarget <- "DaysInHospital"

# Put the target on the log scale (log(1 + x)); only the training rows are
# transformed.
alldata[trainrows, theTarget] <- log1p(alldata[trainrows, theTarget])

# Column position of the target, used later to split predictors from response.
targindex <- which(names(alldata) == theTarget)
########################################
# build the model
########################################
# GBM model settings, these can be varied.
GBM_NTREES <- 500      # passed to gbm.fit as n.trees
GBM_SHRINKAGE <- 0.05  # passed to gbm.fit as shrinkage
GBM_DEPTH <- 4         # passed to gbm.fit as interaction.depth
GBM_MINOBS <- 50       # passed to gbm.fit as n.minobsinnode

# Build the GBM model on the training rows: every column except the target
# is a predictor, the target column is the response.
library(gbm)
GBM_model <- gbm.fit(
  x = alldata[trainrows, -targindex],
  y = alldata[trainrows, targindex],
  distribution = "gaussian",
  n.trees = GBM_NTREES,
  shrinkage = GBM_SHRINKAGE,
  interaction.depth = GBM_DEPTH,
  n.minobsinnode = GBM_MINOBS,
  verbose = TRUE
)

# List variable importance. n.trees must be passed by name: the second
# positional argument of summary.gbm is cBars (number of bars to plot), not
# the number of trees. print() makes the table appear under source() too.
print(summary(GBM_model, n.trees = GBM_NTREES))
# Predict for the leaderboard rows using all GBM_NTREES trees. The predict()
# generic dispatches to the gbm method; n.trees is named so the call does not
# depend on argument position.
prediction <- predict(
  GBM_model,
  newdata = alldata[scorerows, -targindex],
  n.trees = GBM_NTREES
)

# Put on the correct scale (expm1 undoes the log1p applied to the training
# target) and cap the result to the range [0, 15].
prediction <- expm1(prediction)
prediction <- pmax(0, pmin(15, prediction))

# Plot the submission distribution.
hist(prediction, breaks = 500)
########################################
# write the submission to file
########################################
# Build the submission as a data frame rather than with cbind(): cbind()
# would coerce both columns into one numeric matrix, turning integer member
# IDs into doubles, which write.csv can render in scientific notation
# (e.g. 1e+07). A data frame keeps each column's own type.
submission <- data.frame(MemberID = memberid, DaysInHospital = prediction)
fnname <- "GBM_demo1.csv"
write.csv(submission, file = fnname, row.names = FALSE)

# Report the total run time; print() labels the proc_time components.
elapsedtime <- proc.time() - starttime
cat("\nFinished\n")
print(elapsedtime)