XGBoost Model
#Setting Working Directory
# NOTE(review): a hard-coded, machine-specific path makes this script
# non-portable — prefer an RStudio project or relative paths. Left unchanged.
setwd("C:/Users/Capstone Project")
getwd()
## [1] "C:/Users/Capstone Project"
#Importing Dataset
library(readr)
# Flight records with customer demographics and delay fields
# (see the parsed column specification below).
flight_dt <- read_csv("Aviation Data/Marketing Project_Flight_data.csv")
## Parsed with column specification:
## cols(
##   CustomerID = col_double(),
##   Gender = col_character(),
##   CustomerType = col_character(),
##   Age = col_double(),
##   TypeTravel = col_character(),
##   Class = col_character(),
##   Flight_Distance = col_double(),
##   DepartureDelayin_Mins = col_double(),
##   ArrivalDelayin_Mins = col_double()
## )
# Survey responses keyed by CustomerId — note the different capitalisation
# from CustomerID in the flight data; the join below accounts for this.
survey_dt <- read_csv("Aviation Data/Marketing Project_Survey_data.csv")
## Parsed with column specification:
## cols(
##   CustomerId = col_double(),
##   Satisfaction = col_character(),
##   Seat_comfort = col_character(),
##   Departure.Arrival.time_convenient = col_character(),
##   Food_drink = col_character(),
##   Gate_location = col_character(),
##   Inflightwifi_service = col_character(),
##   Inflight_entertainment = col_character(),
##   Online_support = col_character(),
##   Ease_of_Onlinebooking = col_character(),
##   Onboard_service = col_character(),
##   Leg_room_service = col_character(),
##   Baggage_handling = col_character(),
##   Checkin_service = col_character(),
##   Cleanliness = col_character(),
##   Online_boarding = col_character()
## )
library(dplyr)
# Left join keeps every flight row and attaches the matching survey answers;
# the key is spelled CustomerID in flight_dt but CustomerId in survey_dt.
Aviation = left_join(flight_dt, survey_dt, by = c("CustomerID" = "CustomerId"))
# NOTE(review): clean_names() is from the janitor package, which this script
# never attaches — this call errors unless janitor is loaded elsewhere. Its
# result is also not assigned, so Aviation keeps its original column names;
# the call only prints a snake_cased preview of the joined data.
clean_names(Aviation)
## # A tibble: 90,917 x 24
##    customer_id gender customer_type   age type_travel class flight_distance
##          <dbl> <chr>  <chr>         <dbl> <chr>       <chr>           <dbl>
##  1      149965 Female Loyal Custom~    65 Personal T~ Eco               265
##  2      149966 Female Loyal Custom~    15 Personal T~ Eco              2138
##  3      149967 Female Loyal Custom~    60 Personal T~ Eco               623
##  4      149968 Female Loyal Custom~    70 Personal T~ Eco               354
##  5      149969 Male   Loyal Custom~    30 <NA>        Eco              1894
##  6      149970 Female Loyal Custom~    66 Personal T~ Eco               227
##  7      149971 Male   Loyal Custom~    10 Personal T~ Eco              1812
##  8      149972 Male   Loyal Custom~    22 Personal T~ Eco              1556
##  9      149973 Female Loyal Custom~    58 Personal T~ Eco               104
## 10      149974 Female Loyal Custom~    34 Personal T~ Eco              3633
## # ... with 90,907 more rows, and 17 more variables:
## #   departure_delayin_mins <dbl>, arrival_delayin_mins <dbl>,
## #   satisfaction <chr>, seat_comfort <chr>,
## #   departure_arrival_time_convenient <chr>, food_drink <chr>,
## #   gate_location <chr>, inflightwifi_service <chr>,
## #   inflight_entertainment <chr>, online_support <chr>,
## #   ease_of_onlinebooking <chr>, onboard_service <chr>,
## #   leg_room_service <chr>, baggage_handling <chr>, checkin_service <chr>,
## #   cleanliness <chr>, online_boarding <chr>
Aviation = Aviation[-c(1)] # drop column 1 (CustomerID): an identifier, not a predictor — Gender is kept
# All remaining character columns become factors.
Aviation <- mutate_if(Aviation, is.character, as.factor)
## CustomerType (col 2) and TypeTravel (col 4) each contain ~10% missing
## values, so both are dropped from the analysis.
Aviation <- Aviation[-c(2, 4)]
## Mean-impute the missing arrival delays.
arr_missing <- is.na(Aviation$ArrivalDelayin_Mins)
Aviation$ArrivalDelayin_Mins[arr_missing] <- mean(Aviation$ArrivalDelayin_Mins, na.rm = TRUE)
## Statistical mode helper: returns the most frequent value in x.
## Ties are broken in favour of whichever value appears first in x.
dt_mode <- function(x) {
  distinct_vals <- unique(x)
  freq <- tabulate(match(x, distinct_vals))
  distinct_vals[which.max(freq)]
}
# Mode-impute the surveyed columns that still contain missing values,
# computing the mode from the observed (non-NA) responses only.
for (col in c("Departure.Arrival.time_convenient", "Food_drink", "Onboard_service")) {
  observed <- Aviation[[col]][!is.na(Aviation[[col]])]
  Aviation[[col]][is.na(Aviation[[col]])] <- dt_mode(observed)
}

# Converting character variables to integers
# Gender: Male -> 1, Female -> 0.
Aviation$Gender <- as.numeric(ifelse(Aviation$Gender == "Male", 1, 0))
# Class: Business -> 1, Eco Plus -> 2, anything else (Eco) -> 3.
Aviation$Class <- as.numeric(ifelse(Aviation$Class == "Business", 1,
                             ifelse(Aviation$Class == "Eco Plus", 2, 3)))

# Map one survey-rating column to its numeric score. Any rating not listed
# below (e.g. "excellent") scores 5, and NA stays NA — exactly what the
# original nested ifelse produced. The original applied ifelse() to a whole
# 14-column slice and relied on implicit matrix flattening plus column-major
# recycling on re-assignment; a per-column lookup is explicit and safe.
rating_score <- function(x) {
  scores <- c("extremely poor" = 0, "poor" = 1, "need improvement" = 2,
              "acceptable" = 3, "good" = 4)
  x <- as.character(x)
  out <- unname(scores[x])          # NA for both missing and unlisted ratings
  out[is.na(out) & !is.na(x)] <- 5  # unlisted (non-NA) ratings score 5
  out
}
# Columns 8:21 are the 14 survey-rating columns.
Aviation[8:21] <- lapply(Aviation[8:21], rating_score)

Label Conversion and Converting the Dataset to a Data Frame

library(xgboost)
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
# XGBoost needs a numeric feature matrix plus a separate label vector, so
# move the Satisfaction target to the last column: everything before it is a
# feature. (The original did the same thing with a positional shuffle through
# columns 7 and 21; relocating by name is equivalent and far less brittle.)
# NOTE(review): Satisfaction stays a factor here — it is never converted to a
# 0-based integer as the XGBoost docs require; data.matrix()/xgb.DMatrix()
# later encode it as numeric codes 1/2 instead.
xgbAvi <- as.data.frame(Aviation)  # xgboost works on plain data frames
xgbAvi <- xgbAvi[c(setdiff(names(xgbAvi), "Satisfaction"), "Satisfaction")]
str(xgbAvi)
## 'data.frame':    90917 obs. of  21 variables:
##  $ Gender                           : num  0 0 0 0 1 0 1 1 0 0 ...
##  $ Age                              : num  65 15 60 70 30 66 10 22 58 34 ...
##  $ Class                            : num  3 3 3 3 3 3 3 3 3 3 ...
##  $ Flight_Distance                  : num  265 2138 623 354 1894 ...
##  $ DepartureDelayin_Mins            : num  0 0 0 0 0 17 0 30 47 0 ...
##  $ ArrivalDelayin_Mins              : num  0 0 0 0 0 15 0 26 48 0 ...
##  $ Seat_comfort                     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Departure.Arrival.time_convenient: num  0 0 4 0 0 0 0 4 0 0 ...
##  $ Food_drink                       : num  0 0 0 0 0 3 3 0 0 0 ...
##  $ Gate_location                    : num  2 5 5 5 5 5 5 5 5 5 ...
##  $ Inflightwifi_service             : num  2 2 3 4 2 2 2 2 3 2 ...
##  $ Inflight_entertainment           : num  4 0 4 3 0 5 0 0 3 0 ...
##  $ Online_support                   : num  2 2 3 4 2 5 2 2 3 2 ...
##  $ Ease_of_Onlinebooking            : num  3 2 1 2 2 5 2 2 3 2 ...
##  $ Onboard_service                  : num  3 4 1 2 5 5 3 2 3 3 ...
##  $ Leg_room_service                 : num  0 3 0 0 4 0 3 4 0 2 ...
##  $ Baggage_handling                 : num  3 4 1 2 5 5 4 5 1 5 ...
##  $ Checkin_service                  : num  5 4 4 4 5 5 5 3 2 2 ...
##  $ Cleanliness                      : num  3 4 1 2 4 5 4 4 3 5 ...
##  $ Online_boarding                  : num  2 2 3 5 2 3 2 2 5 2 ...
##  $ Satisfaction                     : Factor w/ 2 levels "neutral or dissatisfied",..: 2 2 2 2 2 2 2 2 2 2 ...

Split the data for training and testing (75/25 split)

#### **Split the data for training and testing (75/25 split)**
# createDataPartition() here and confusionMatrix() below both come from
# caret, which the original script never attached — without this the script
# stops with "could not find function".
library(caret)
set.seed(42)  # fix the RNG so the stratified 75/25 partition is reproducible
indexes <- createDataPartition(xgbAvi$Satisfaction, p = 0.75, list = FALSE)
train <- xgbAvi[indexes, ]
test <- xgbAvi[-indexes, ]

# Column 21 is the Satisfaction target; the first 20 columns are features.
train.data <- data.matrix(train[, -21])
train.label <- train[, 21]

test.data <- data.matrix(test[, -21])
test.label <- test[, 21]

Create the xgb.DMatrix objects

# Transform the two data sets into xgb.Matrix
# NOTE(review): train.label/test.label are factors with levels
# "neutral or dissatisfied"/"satisfied"; they appear to be coerced to their
# numeric codes (1/2) here rather than the 0/1 xgboost expects for a binary
# objective — the downstream round/cap-at-2 step depends on this encoding.
xgb.train = xgb.DMatrix(data=train.data,label=train.label)
xgb.test = xgb.DMatrix(data=test.data,label=test.label)

Define the model

# We can define the xgboost model with xgboost function with changing some of the parameters. Note that xgboost is a training function, thus we need to include the train data too. Once we run the function, it fits the model with training data.
# NOTE(review): no objective is specified, so xgboost falls back to its
# default squared-error regression — the log below reports train-rmse — even
# though Satisfaction is a binary target. Consider objective =
# "binary:logistic" with 0/1 labels; the later round/clamp step exists only
# to map the regression scores back onto the two factor levels.
xgbModel = xgboost(data = xgb.train, max.depth=3, nrounds=50)
## [1]  train-rmse:0.844626 
## [2]  train-rmse:0.634427 
## [3]  train-rmse:0.499091 
## [4]  train-rmse:0.413770 
## [5]  train-rmse:0.362687 
## [6]  train-rmse:0.330878 
## [7]  train-rmse:0.312292 
## [8]  train-rmse:0.301877 
## [9]  train-rmse:0.296154 
## [10] train-rmse:0.292354 
## [11] train-rmse:0.288608 
## [12] train-rmse:0.285500 
## [13] train-rmse:0.284341 
## [14] train-rmse:0.282622 
## [15] train-rmse:0.281670 
## [16] train-rmse:0.280383 
## [17] train-rmse:0.279421 
## [18] train-rmse:0.278295 
## [19] train-rmse:0.277515 
## [20] train-rmse:0.276470 
## [21] train-rmse:0.275707 
## [22] train-rmse:0.274822 
## [23] train-rmse:0.274468 
## [24] train-rmse:0.273616 
## [25] train-rmse:0.273212 
## [26] train-rmse:0.272497 
## [27] train-rmse:0.271886 
## [28] train-rmse:0.271041 
## [29] train-rmse:0.270028 
## [30] train-rmse:0.269354 
## [31] train-rmse:0.268568 
## [32] train-rmse:0.268209 
## [33] train-rmse:0.267856 
## [34] train-rmse:0.267388 
## [35] train-rmse:0.266989 
## [36] train-rmse:0.266528 
## [37] train-rmse:0.265093 
## [38] train-rmse:0.263613 
## [39] train-rmse:0.262243 
## [40] train-rmse:0.261973 
## [41] train-rmse:0.261679 
## [42] train-rmse:0.261491 
## [43] train-rmse:0.261183 
## [44] train-rmse:0.260925 
## [45] train-rmse:0.260741 
## [46] train-rmse:0.260610 
## [47] train-rmse:0.260008 
## [48] train-rmse:0.259328 
## [49] train-rmse:0.259093 
## [50] train-rmse:0.258834
# Show the fitted booster: parameters, feature count, and the training log.
print(xgbModel)
## ##### xgb.Booster
## raw: 33.8 Kb 
## call:
##   xgb.train(params = params, data = dtrain, nrounds = nrounds, 
##     watchlist = watchlist, verbose = verbose, print_every_n = print_every_n, 
##     early_stopping_rounds = early_stopping_rounds, maximize = maximize, 
##     save_period = save_period, save_name = save_name, xgb_model = xgb_model, 
##     callbacks = callbacks, max.depth = 3)
## params (as set within xgb.train):
##   max_depth = "3", silent = "1"
## xgb.attributes:
##   niter
## callbacks:
##   cb.print.evaluation(period = print_every_n)
##   cb.evaluation.log()
## # of features: 20 
## niter: 50
## nfeatures : 20 
## evaluation_log:
##     iter train_rmse
##        1   0.844626
##        2   0.634427
## ---                
##       49   0.259093
##       50   0.258834

Predicting test data

# The model is ready and we can predict our test data.
xbgPreds <- predict(xgbModel, xgb.test)

# The booster was fit as a regression on labels coded 1/2, so clamp the raw
# scores into [1, 2] before rounding. The original only capped values above
# 2: a score below 0.5 would round to 0, and levels(test.label)[0] silently
# drops that element, desynchronising pred.label from test.label.
xbgPreds <- pmin(pmax(xbgPreds, 1), 2)
# Map the rounded codes back to factor labels, pinning both levels so the
# confusion matrix stays valid even if one class is never predicted.
pred.label <- factor(levels(test.label)[round(xbgPreds)],
                     levels = levels(test.label))

Evaluate the Predictions with a Confusion Matrix

# Check the prediction accuracy with a confusion matrix.
# caret::confusionMatrix(data, reference) takes the PREDICTIONS first and the
# ground truth second; the original call had them swapped, which transposes
# the table and flips sensitivity/specificity and PPV/NPV.
conf_mtrx <- confusionMatrix(data = pred.label, reference = test.label)
print(conf_mtrx)
## Confusion Matrix and Statistics
## 
##                          Reference
## Prediction                neutral or dissatisfied satisfied
##   neutral or dissatisfied                    9264      1025
##   satisfied                                  1007     11433
##                                                  
##                Accuracy : 0.9106                 
##                  95% CI : (0.9068, 0.9143)       
##     No Information Rate : 0.5481                 
##     P-Value [Acc > NIR] : <2e-16                 
##                                                  
##                   Kappa : 0.8196                 
##                                                  
##  Mcnemar's Test P-Value : 0.7061                 
##                                                  
##             Sensitivity : 0.9020                 
##             Specificity : 0.9177                 
##          Pos Pred Value : 0.9004                 
##          Neg Pred Value : 0.9191                 
##              Prevalence : 0.4519                 
##          Detection Rate : 0.4076                 
##    Detection Prevalence : 0.4527                 
##       Balanced Accuracy : 0.9098                 
##                                                  
##        'Positive' Class : neutral or dissatisfied
##