Proposed solution for Mall Customers analysis¶

Statistical Laboratory

Alessandro Ortis - University of Catania

Consider the Mall Customers Data Set. The dataset can be downloaded at the following URL:

http://www.dmi.unict.it/ortis/StatsLab/mall/

data = read.csv("Mall_Customers.csv")
names(data) <- c("CustomerID","Gender","Age","Annual_Income", "Spending_Score")
head(data)

summary(data$Age)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  18.00   28.75   36.00   38.85   49.00   70.00

summary(data$Gender)

summary(data$Annual_Income)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  15.00   41.50   61.50   60.56   78.00  137.00

summary(data$Spending_Score)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1.00   34.75   50.00   50.20   73.00   99.00

data_f = data[which(data$Gender == 'Female'),]
summary(data_f$Annual_Income)
summary(data_f$Spending_Score)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  16.00   39.75   60.00   59.25   77.25  126.00

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   5.00   35.00   50.00   51.53   73.00   99.00

data_m = data[which(data$Gender == 'Male'),]
summary(data_m$Annual_Income)
summary(data_m$Spending_Score)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  15.00   45.50   62.50   62.23   78.00  137.00

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1.00   24.50   50.00   48.51   70.00   97.00

plot(x = data$Annual_Income, y=data$Spending_Score, col=c(1:2)[data$Gender])

#data$Gender <- as.factor(data$Gender)
#plot(data$Annual_Income, data$Spending_Score, col=data$Gender)

# Income vs Spending Score
in_spen = data[,c(4,5)]

KM<-kmeans(in_spen,5)  
KM

K-means clustering with 5 clusters of sizes 81, 23, 22, 39, 35

Cluster means:
  Annual_Income Spending_Score
1      55.29630       49.51852
2      26.30435       20.91304
3      25.72727       79.36364
4      86.53846       82.12821
5      88.20000       17.11429

Clustering vector:
  [1] 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2
 [38] 3 2 3 2 3 2 1 2 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[112] 1 1 1 1 1 1 1 1 1 1 1 1 4 5 4 1 4 5 4 5 4 1 4 5 4 5 4 5 4 5 4 1 4 5 4 5 4
[149] 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5
[186] 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4

Within cluster sum of squares by cluster:
[1]  9875.111  5098.696  3519.455 13444.051 12511.143
 (between_SS / total_SS =  83.5 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"

c1 = data[which(KM$cluster == 1),]
head(c1)

plot(data$Annual_Income, data$Spending_Score, col=KM$cluster)

There are five distinct income-spending patterns.

c1 = data[which(KM$cluster == 1),]
c2 = data[which(KM$cluster == 2),]
c3 = data[which(KM$cluster == 3),]
c4 = data[which(KM$cluster == 4),]
c5 = data[which(KM$cluster == 5),]

# Alternatelively, I can extract these numbers from the KM variable
cluster_centroids <- data.frame(
        Spending_Score = c(mean(c1$Spending_Score), 
                           mean(c2$Spending_Score),
                           mean(c3$Spending_Score),
                           mean(c4$Spending_Score),
                           mean(c5$Spending_Score)),

        Annual_Income = c(mean(c1$Annual_Income), 
                          mean(c2$Annual_Income), 
                          mean(c3$Annual_Income), 
                          mean(c4$Annual_Income), 
                          mean(c5$Annual_Income)))
                          
row.names(cluster_centroids) <- c('C1','C2','C3','C4','C5')
cluster_centroids

Most interesting patterns are those related to people with high spending score and low budget, and viceversa.

# these two clusters may change at every kmeans() execution
hsli = c3 # high_spending_low_income
lshi = c5 # low_spending_high_income

summary(hsli)
summary(lshi)

   CustomerID       Gender        Age        Annual_Income   Spending_Score 
 Min.   : 2.00   Female:13   Min.   :18.00   Min.   :15.00   Min.   :61.00  
 1st Qu.:12.50   Male  : 9   1st Qu.:21.25   1st Qu.:19.25   1st Qu.:73.00  
 Median :23.00               Median :23.50   Median :24.50   Median :77.00  
 Mean   :23.09               Mean   :25.27   Mean   :25.73   Mean   :79.36  
 3rd Qu.:33.50               3rd Qu.:29.75   3rd Qu.:32.25   3rd Qu.:85.75  
 Max.   :46.00               Max.   :35.00   Max.   :39.00   Max.   :99.00

   CustomerID       Gender        Age        Annual_Income   Spending_Score 
 Min.   :125.0   Female:16   Min.   :19.00   Min.   : 70.0   Min.   : 1.00  
 1st Qu.:148.0   Male  :19   1st Qu.:34.00   1st Qu.: 77.5   1st Qu.:10.00  
 Median :165.0               Median :42.00   Median : 85.0   Median :16.00  
 Mean   :164.4               Mean   :41.11   Mean   : 88.2   Mean   :17.11  
 3rd Qu.:182.0               3rd Qu.:47.50   3rd Qu.: 97.5   3rd Qu.:23.50  
 Max.   :199.0               Max.   :59.00   Max.   :137.0   Max.   :39.00

The median of age is higher for the group of customers who has high incomes and low spending scores. The gender is balanced.

interesting_customers = data.frame(rbind(hsli,lshi))

age_vs_spending = data.frame(Age = interesting_customers$Age,
                             SpScore = interesting_customers$Spending_Score)

pairs(age_vs_spending, main = "Age/Spending", 
      pch = 21, bg = c("red", "green")[unclass(interesting_customers$Gender)])

cor(data$Age,data$Spending_Score)
cor(interesting_customers$Age,interesting_customers$Spending_Score)

There is not such a high linear correlation in the entire dataset. But it increases (in abs value) for this 'special' group of customers.

# check the gender... is not that interesting
#cluster = hsli
#counts <- table(cluster$Gender, cluster$Spending_Score)
#barplot(counts,
#        main="Spending score vs gender",
#        xlab="Spending score",
#        col=c("blue","red"),
#        legend = rownames(counts))

#cluster = interesting_customers
#counts <- table(cluster$Gender, cluster$Spending_Score)
#barplot(counts,
#        main="Spending score vs gender",
#        xlab="Spending score",
#        col=c("blue","red"),
#        legend = rownames(counts))

Are those interesting customers related to their age ?

par(mfrow=c(2,2))    # set the plotting area into a 2*2 array

hist(hsli$Age, freq=FALSE)
lines(density(hsli$Age), col="red")
hist(lshi$Age, freq=FALSE)
lines(density(lshi$Age), col="red")


boxplot(hsli$Age)
boxplot(lshi$Age)

par(mfrow=c(1,2))    
boxplot(data$Age)
boxplot(data$Spending_Score)

Both distributions are slightly skewed toward the lower bound.

summary(data$Age)
summary(data$Spending_Score)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  18.00   28.75   36.00   38.85   49.00   70.00

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1.00   34.75   50.00   50.20   73.00   99.00

age_med = median(data$Age)
sps_med = median(data$Spending_Score)

Now, we may thing that the age can be related to the spending score. Let's check with a X-squared test.

# young low spending
yls = nrow(data[which((interesting_customers$Age<age_med)&(interesting_customers$Spending_Score<sps_med)),])
# young high spending  
yhs = nrow(data[which((interesting_customers$Age<age_med)&(interesting_customers$Spending_Score>=sps_med)),])
# old low spending
ols = nrow(data[which((interesting_customers$Age>=age_med)&(interesting_customers$Spending_Score<sps_med)),])
# old high spending
ohs = nrow(data[which((interesting_customers$Age>=age_med)&(interesting_customers$Spending_Score>=sps_med)),])

c(yls,ols,yhs,ohs)

observed_table <- matrix(c(yls, ols,
                           yhs, ohs), nrow = 2, ncol = 2, byrow = T)
colnames(observed_table) <- c('Young', 'Old')
rownames(observed_table) <- c('LowSpending', 'HghSpending')
observed_table
rowSums(observed_table)
colSums(observed_table)

X <- chisq.test(observed_table)
X
X$expected

	Pearson's Chi-squared test with Yates' continuity correction

data:  observed_table
X-squared = 25.165, df = 1, p-value = 5.262e-07

From the above result, we can observe that the p-value is less than the significance level (0.05). Hence, we can reject the null hypothesis and conclude that the two variables are not independent.

Therefore, in these two groups there is a high relationship between age and spending score. Indeed, there are no old people in the group of high spending customers. Now, repeat it by considering the median values of the interesting customers only, as we considered the median values computed to the whole dataset.

int_age_med = median(interesting_customers$Age)
int_sps_med = median(interesting_customers$Spending_Score)

# young low spending
yls = nrow(data[which((interesting_customers$Age<int_age_med)&(interesting_customers$Spending_Score<int_sps_med)),])
# young high spending  
yhs = nrow(data[which((interesting_customers$Age<int_age_med)&(interesting_customers$Spending_Score>=int_sps_med)),])
# old low spending
ols = nrow(data[which((interesting_customers$Age>=int_age_med)&(interesting_customers$Spending_Score<int_sps_med)),])
# old high spending
ohs = nrow(data[which((interesting_customers$Age>=int_age_med)&(interesting_customers$Spending_Score>=int_sps_med)),])

c(yls,ols,yhs,ohs)

observed_table <- matrix(c(yls, ols,
                           yhs, ohs), nrow = 2, ncol = 2, byrow = T)
colnames(observed_table) <- c('Young', 'Old')
rownames(observed_table) <- c('LowSpending', 'HghSpending')
observed_table
rowSums(observed_table)
colSums(observed_table)

X <- chisq.test(observed_table)
X
X$expected

	Pearson's Chi-squared test with Yates' continuity correction

data:  observed_table
X-squared = 11.131, df = 1, p-value = 0.0008489

In this case the p-value indicates that there is a 0.085% of prob. that the two variables are independent and to observe such statstic value. Such value is still lower a 5% significance level, but is not that low as we are used to observe when data are strongly realated. This may be due to the low number of samples (57), as we know that the X-squared test is affected by the number of samples.

So far, the test suggested that in the two interesting customers groups there may be a relationship between the spending score and the age of the customers. Now, repeat the X-squared test to assess, with more data, if this relationship may be extended to the general customers of the Mall.

# young low spending
yls = nrow(data[which((data$Age<age_med)&(data$Spending_Score<sps_med)),])
# young high spending  
yhs = nrow(data[which((data$Age<age_med)&(data$Spending_Score>=sps_med)),])
# old low spending
ols = nrow(data[which((data$Age>=age_med)&(data$Spending_Score<sps_med)),])
# old low spending
ohs = nrow(data[which((data$Age>=age_med)&(data$Spending_Score>=sps_med)),])

c(yls,ols,yhs,ohs)

observed_table <- matrix(c(yls, ols,
                           yhs, ohs), nrow = 2, ncol = 2, byrow = T)
colnames(observed_table) <- c('Young', 'Old')
rownames(observed_table) <- c('LowSpending', 'HghSpending')
observed_table
rowSums(observed_table)
colSums(observed_table)
X <- chisq.test(observed_table)
X

	Pearson's Chi-squared test with Yates' continuity correction

data:  observed_table
X-squared = 27.461, df = 1, p-value = 1.603e-07

X$expected

From the above result, we can observe that the p-value is less than the significance level (0.05). Hence, we can reject the null hypothesis and conclude that the two variables are not independent.

To further assess that spending score is not related to the gender, we can repeat the test with the gender counts in the contingency table.

# male low spending
mls = nrow(data[which((data$Gender=='Male')&(data$Spending_Score<sps_med)),])
# male high spending  
mhs = nrow(data[which((data$Gender=='Male')&(data$Spending_Score>=sps_med)),])
# female low spending
fls = nrow(data[which((data$Gender=='Female')&(data$Spending_Score<sps_med)),])
# female high spending
fhs = nrow(data[which((data$Gender=='Female')&(data$Spending_Score>=sps_med)),])

c(mls,mhs,fls,fhs)

observed_table <- matrix(c(mls, mhs,
                           fls, fhs), nrow = 2, ncol = 2, byrow = T)
colnames(observed_table) <- c('Male', 'Female')
rownames(observed_table) <- c('LowSpending', 'HghSpending')
observed_table
rowSums(observed_table)
colSums(observed_table)

X <- chisq.test(observed_table)
X

	Pearson's Chi-squared test with Yates' continuity correction

data:  observed_table
X-squared = 0.011725, df = 1, p-value = 0.9138

new_data = data.frame(data, Young =data$Age<median(data$Age) )

plot(x = new_data$Annual_Income, y=new_data$Spending_Score, pch = 21, bg=c("Red", "Blue")[as.integer(new_data$Young)+1])

The plot shows that in the group of low spending and low income the most of people are old. Moreover, the younger customers are mostly distributed in the higher side of spanding score, regardless their incomes.

the p-value is high: >91% to observe the statistic in the case of independence (H0).

Summarize the X-squared test results:

In the interesting customer group (57 samples), Age and spending behaviour are related, indeed, we obtained a p-value of 0.085% (medians computed on the subset)
In the interesting customer group (57 samples), Age and spending behaviour are related, indeed, we obtained a p-value of 0.00005% (medians computed on the whole dataset)
In the whole dataset (200 samples), Age and spending behaviour are related, indeed, we obtained a p-value of 0.000016%
In the whole dataset (200 samples), Gender and spending behaviour are not related, indeed, we obtained a p-value of 91%
Young customers are used to spend money, regardless their incomes. Indeed, a very few customers that have low spending scores are young.

Concluson: the Mall manager may decide to include more services dedicated to old customers, regardless their gender, mantaining what attract young customers.

CustomerID	Gender	Age	Annual_Income	Spending_Score
1	Male	19	15	39
2	Male	21	15	81
3	Female	20	16	6
4	Female	23	16	77
5	Female	31	17	40
6	Female	22	17	76

	Spending_Score	Annual_Income
C1	49.51852	55.29630
C2	20.91304	26.30435
C3	79.36364	25.72727
C4	82.12821	86.53846
C5	17.11429	88.20000

	Young	Old
LowSpending	10	25
HghSpending	22	0

	Young	Old
LowSpending	19.64912	15.350877
HghSpending	12.35088	9.649123

	Young	Old
LowSpending	6	22
HghSpending	20	9

	Young	Old
LowSpending	12.77193	15.22807
HghSpending	13.22807	15.77193