Linear Discriminant Analysis

In this assignment, we will work on the football data, the same as in the previous assignment. To do so, I create a new column called “Situation” and give each team a new value, which is: - “Podium” if the team is in the top three - “Relegation” if the team is ranked below tenth - “Middle” otherwise.

# Label each team from its final ranking (column Clt):
# top three -> "Podium", ranked below tenth -> "Relegation", else "Middle".
# Vectorized ifelse() replaces the original element-wise loop; this also
# drops the accidental trailing space in the original 'Middle ' label,
# which would have produced a visually identical but distinct level.
Situation <- ifelse(stats$Clt <= 3, "Podium",
                    ifelse(stats$Clt > 10, "Relegation", "Middle"))
table(Situation)
## Situation
##    Middle      Podium Relegation 
##         35         15         10
# Append the new label to the stats table, then keep only the variables
# used for the discriminant analyses (columns 1,2,9:12,15) plus the label.
stats <- cbind(stats, Situation)
data <- stats[, c(1, 2, 9, 10, 11, 12, 15, 17)]

On the other hand, I also performed a clustering with the K-means method to distinguish three classes.

# Cluster the seven numeric variables into three groups with k-means.
# k-means starts from random centers, so fix the RNG seed first: without it
# the cluster labels (and everything derived from them below) change on
# every run and the pasted output cannot be reproduced.
set.seed(42)
k_means <- kmeans(data[, 1:7], centers = 3)
#k_means
table(k_means$cluster)
## 
##  1  2  3 
## 33 14 13
# Cross-tabulate the clusters against the hand-made Situation label.
table(data$Situation, k_means$cluster)
##             
##               1  2  3
##   Middle     25  2  8
##   Podium      0 12  3
##   Relegation  8  0  2
# Store the cluster assignment as a factor: it is the response of the
# second LDA model below.
data$Class <- as.factor(k_means$cluster)

Now, we can produce some linear discriminant analysis on our dataset.

# Two LDA fits (MASS::lda — presumably attached in an earlier chunk; verify):
# lda1 predicts the hand-made Situation label, dropping column 7 (Pts) and
# column 9 (Class); lda2 predicts the k-means Class, dropping column 8
# (Situation). Printing each object shows priors, group means, discriminant
# coefficients and the proportion of trace per axis.
lda1 <- lda(Situation ~ ., data = data[,-c(7,9)])
lda2 <- lda(Class ~ ., data = data[,-c(8)])
lda1
## Call:
## lda(Situation ~ ., data = data[, -c(7, 9)])
## 
## Prior probabilities of groups:
##    Middle      Podium Relegation 
##  0.5833333  0.2500000  0.1666667 
## 
## Group means:
##                  JC      Age `Titu Min/Match`     Remp  Mn.Remp   RempNE
## Middle     22.14286 24.80571         84.77143 58.97143 19.34286 60.71429
## Podium     23.46667 25.76667         83.73333 62.66667 21.66667 59.46667
## Relegation 24.50000 24.31000         83.70000 59.80000 22.30000 59.50000
## 
## Coefficients of linear discriminants:
##                           LD1          LD2
## JC                0.305293315 -0.012646090
## Age              -0.370822172 -0.790585630
## `Titu Min/Match` -0.180326487 -0.006261055
## Remp             -0.034343618 -0.009458037
## Mn.Remp           0.093563999 -0.103976596
## RempNE           -0.005814275  0.013513704
## 
## Proportion of trace:
##    LD1    LD2 
## 0.5239 0.4761
lda2
## Call:
## lda(Class ~ ., data = data[, -c(8)])
## 
## Prior probabilities of groups:
##         1         2         3 
## 0.5500000 0.2333333 0.2166667 
## 
## Group means:
##         JC      Age `Titu Min/Match`     Remp  Mn.Remp   RempNE      Pts
## 1 22.36364 24.51212         84.60606 57.57576 19.78788 53.15152 19.36364
## 2 23.21429 25.65714         83.85714 63.07143 21.07143 52.50000 47.85714
## 3 23.76923 25.36154         84.15385 63.00000 21.30769 86.38462 31.76923
## 
## Coefficients of linear discriminants:
##                         LD1         LD2
## JC               0.10360530 -0.08326920
## Age              0.13933694 -0.07799307
## `Titu Min/Match` 0.88483834  0.37896285
## Remp             0.08815164  0.02803890
## Mn.Remp          0.30390503  0.13950079
## RempNE           0.16763957  0.01926390
## Pts              0.01369716 -0.09419301
## 
## Proportion of trace:
##    LD1    LD2 
## 0.8402 0.1598

Let’s analyse the two linear discriminant analyses. When we compare the proportions of trace of the two axes, the second model seems to fit the dataset better. Therefore, we will only analyse this one.

# Confusion table: k-means cluster (rows) vs hand-made Situation label
# (columns), to see how well the unsupervised clusters match the ranking.
table(data$Class, data$Situation)
##    
##     Middle  Podium Relegation
##   1      25      0          8
##   2       2     12          0
##   3       8      3          2

We can see that the first class of the k-means method mainly contains the middle/bottom teams, the second class mainly the top teams, and the third class mostly middle/top teams.

# Project the seven predictors onto the two discriminant directions of the
# second LDA model and tag each observation with its k-means cluster id.
proj <- cbind(as.matrix(data[, c(1:7)]) %*% lda2$scaling, data$Class)

# Centroid of each cluster in the discriminant plane.
centroid1 <- colMeans(proj[proj[, 3] == 1, 1:2])
centroid2 <- colMeans(proj[proj[, 3] == 2, 1:2])
centroid3 <- colMeans(proj[proj[, 3] == 3, 1:2])

cluster_cols <- c("red", "yellow", "green")
plot(proj[, 2] ~ proj[, 1],
     pch = 22,
     bg = cluster_cols[unclass(as.factor(data$Class))],
     xlab = "First LDA direction",
     ylab = "Second LDA direction",
     main = "Graphical representation on two axes")
points(centroid1[1], centroid1[2], cex = 2, pch = 21, bg = "red")
points(centroid2[1], centroid2[2], cex = 2, pch = 21, bg = "yellow")
points(centroid3[1], centroid3[2], cex = 2, pch = 21, bg = "green")
legend("topleft", inset = 0.02, pch = 19,
       legend = levels(data$Class), col = cluster_cols)

Visual outputs

During this assignment, I had an error with the “predict” function that I couldn’t fix. Therefore, I will provide only descriptive plots.

# Partition plots: LDA decision regions for every pair of predictors.
library("klaR")
partimat(as.factor(Situation) ~ JC + Age + Remp + Mn.Remp + RempNE, data = data, method = "lda")

# NOTE(review): scatterplotMatrix() comes from the "car" package, which is
# not loaded in this chunk — presumably attached in an earlier one; verify,
# or add library(car) before this call.
scatterplotMatrix(data[,1:7])

Comparison with alternative classification approach

We can use some GLM models to classify and analyse our dataset. In this case, the multinomial approach seems to be the best fit for the data.

# Alternative classifier: multinomial logistic regression on the Situation
# label, with the same predictors as the first LDA model.
# Use library() rather than require() so a missing package fails loudly
# instead of returning FALSE and failing later.
library(foreign)
library(nnet)
library(ggplot2)
library(reshape2)

reg <- multinom(Situation ~ ., data = data[, -c(7, 9)])
## # weights:  24 (14 variable)
## initial  value 65.916737 
## iter  10 value 46.719621
## iter  20 value 44.288644
## iter  30 value 44.195662
## iter  30 value 44.195662
## final  value 44.195662 
## converged
# Named reg_summary (not "sum") to avoid masking base::sum().
reg_summary <- summary(reg)

# Coefficients are log-odds relative to the baseline level ("Middle").
reg_summary$coefficients
##            (Intercept)        JC        Age `Titu Min/Match`         Remp
## Podium       -15.71126 0.1469263  0.8875781       -0.1365690 -0.008229490
## Relegation    10.69874 0.4865931 -0.6278227       -0.1311163 -0.008365773
##              Mn.Remp       RempNE
## Podium     0.1227524 -0.023325653
## Relegation 0.1863258 -0.003842679