Analysis Of Sample Kurtosis

0001-01-01 R

Kurtosis

We define a function for the sample-size-standardised excess kurtosis.

kurtosis <- function(x) {
  xstand <- x - mean(x)
  kurt <- (sum((xstand)^4))/((sum((xstand)^2))^2)-3/length(x)
  return(kurt)
}

Generate samples

We select random sample sizes and generate a matrix of samples arranged in a list.

We are generating samples from the Stable distribution using the stabledist package.

The index parameter is chosen randomly between 1 and 2. The skewness parameter is set to zero.

n <- 500
minsize <- 20
maxsize <- 1500
samplesizes <- ceiling(runif(n,minsize,maxsize))
alphas <- runif(n,1,2)
library(stabledist)
samples <- mapply(rstable,samplesizes,alphas,rep(0,n))

We used 500 samples of size ranging from 20 to 1500.

Calculate Kurtosis

We use parallel processing to speed up the computation.

library(parallel)
cl <- makeCluster(max(1,floor(0.75*detectCores(logical = FALSE)))) # Select number of cores automatically
nul <- clusterExport(cl,c('kurtosis'))
outputs <- parLapply(cl,samples,kurtosis)
stopCluster(cl)

Graph

We draw a graph of the results.

kurtosis_random <- unlist(outputs)*samplesizes
plot(alphas,kurtosis_random,pch='*',col='black',main='', xlab = expression(alpha), ylab = 'excess kurtosis', log = 'y')
grid()

Graph for single sample size

n <- 1000
samplesize <- 500
alphas <- runif(n,1,2)
samples <- sapply(alphas, function(x) {kurtosis(rstable(samplesize,x,0))})*samplesize + 3
plot(alphas,samples,pch=4,col='blue',main=paste('Sample kurtosis for sample size',samplesize), xlab = expression(alpha), ylab = 'Kurtosis', log = 'y')
grid()

Simulation study

Instead of random values of \(n\) and \(\alpha\), we now use a fixed selection and do a larger scale run.

nsamplespersize <- 25000
minsize <- 50
maxsize <- 500
samplesizes <- seq(minsize,maxsize,minsize)
nsamplesizes <- length(samplesizes)
alphas <- seq(1,2,0.5)
nalphas <- length(alphas)
params <- data.frame(SampleSize=rep(samplesizes,each=nalphas*nsamplespersize), 
                     Alpha=rep(alphas,each=nsamplespersize,times=nsamplesizes),
                     Sample=rep(1:nsamplespersize,times=nsamplesizes*nalphas))
samplekurtosis <- function(pars) {
  n <- pars[1]; a <- pars[2]
  x <- rstable(n,a,0)
  xstand <- x - mean(x)
  kurt <- (sum((xstand)^4))/((sum((xstand)^2))^2)-3/n
}

cl <- makeCluster(max(1,floor(0.75*detectCores(logical = FALSE))))
nul <- clusterEvalQ(cl,{suppressPackageStartupMessages(library(stabledist,warn.conflicts=FALSE,quietly=TRUE))})
nul <- clusterExport(cl,c('samplekurtosis'))
outputs <- parRapply(cl,params,samplekurtosis)
stopCluster(cl)
results <- data.frame(params,g_over_n=outputs)
saveRDS(results,file=paste('Kurtosis',gsub(':','_',date()),'.Rds',sep=''))

Evaluation of simulation study

library(viridisLite)
cols = viridis(nalphas*2)
g <- results$g_over_n*results$SampleSize
x <- c(0,samplesizes)
tp <- ceiling(0.8*length(samplesizes))
plot(c(0,max(samplesizes)),c(0,max(samplesizes)/2),type='n',xlab = 'Sample size', ylab = 'Excess kurtosis', main='')
grid()
for (a in 1:nalphas) {
  al <- alphas[a]
  ga <- g[results$Alpha==al]
  gg <- results$SampleSize[results$Alpha==al]
  ymean <- tapply(ga,gg,mean)
  points(samplesizes, ymean, col=cols[a], pch=4)
  # lower <- c(0,tapply(ga,gg,quantile,0.05))
  # upper <- c(0,tapply(ga,gg,quantile,0.95))
  # lines(x, lower, col=cols[a], lwd=1, lty=2)
  # lines(x, upper, col=cols[a], lwd=1, lty=2)
  lines(x, c(0,ymean), col=cols[a], lwd=2, lty=a)
  text(x[tp],ymean[tp],bquote(alpha==.(al)),col=cols[a],pos=3, offset = (a-1)/nalphas)
  print(coef(lm(ymean~samplesizes)))
}

## (Intercept) samplesizes 
##  -3.5022056   0.5012885 
## (Intercept) samplesizes 
##  -1.3721751   0.2474155 
##   (Intercept)   samplesizes 
## -0.0825938600  0.0001743067

Simulation Study Alternative: t density

nsamplespersize <- 25000
minsize <- 50
maxsize <- 500
samplesizes <- seq(minsize,maxsize,minsize)
nsamplesizes <- length(samplesizes)
dfs <- c(3,4,5)
ndfs <- length(dfs)
params <- data.frame(SampleSize=rep(samplesizes,each=ndfs*nsamplespersize), 
                     DegFreedom=rep(dfs,each=nsamplespersize,times=nsamplesizes),
                     Sample=rep(1:nsamplespersize,times=nsamplesizes*ndfs))
samplekurtosis_t <- function(pars) {
  n <- pars[1]; df <- pars[2]
  x <- rt(n,df)
  xstand <- x - mean(x)
  kurt <- (sum((xstand)^4))/((sum((xstand)^2))^2)-3/n
  return(kurt)
}

library(parallel)
cl <- makeCluster(max(1,floor(0.75*detectCores(logical = FALSE))))
nul <- clusterExport(cl,c('samplekurtosis_t'))
outputs <- parRapply(cl,params,samplekurtosis_t)
stopCluster(cl)
results <- data.frame(params,g_over_n=outputs)
saveRDS(results,file=paste('Kurtosis_t',gsub(':','_',date()),'.Rds',sep=''))

Evaluation of simulation study

We are interested in the distribution of \(g_2/n\) for various values of \(\alpha\) first, assuming it is stable over \(n\).

cols = viridis(ndfs*2)
g <- results$g_over_n*results$SampleSize
x <- c(0,samplesizes)
tp <- ceiling(0.8*length(samplesizes))
plot(c(0,max(samplesizes)),c(0,max(samplesizes)/22),type='n',xlab = 'Sample Size', ylab = 'Excess kurtosis', main='')
grid()
for (a in 1:ndfs) {
  al <- dfs[a]
  ga <- g[results$DegFreedom==dfs[a]]
  gg <- results$SampleSize[results$DegFreedom==dfs[a]]
  ymean <- tapply(ga,gg,mean)
  points(samplesizes, ymean, col=cols[a], pch=4)
  lines(x, c(0,ymean), col=cols[a], lwd=2, lty=a)
  text(x[tp],ymean[tp],bquote("v ="~.(al)),col=cols[a],pos=3, offset = (a-1)/ndfs/2)
  print(coef(lm(ymean~samplesizes)))
}

## (Intercept) samplesizes 
##  4.05946508  0.03307267 
## (Intercept) samplesizes 
##  2.80506557  0.01056211 
## (Intercept) samplesizes 
## 1.936158700 0.004580672

Example: Stock Exchange Data

We begin by loading and displaying the New York Stock Exchange index values and related log returns. The data we used is available here.

mydata <- read.csv('nyexchange.csv')
X <- ts(mydata$indexvalue, start = 2013.33, frequency = 251)
lX <- log(X)
lret <- diff(lX)

par(mfrow=c(2,1), mar=c(2.2,4.2,1.4,0.2))
cols=viridis(3)
plot(X, main = '', ylab = 'Index value', col=cols[1])
grid()
plot(lret, main='', ylab = 'Log return', col=cols[2])
grid()

We notice some dramatic movements at places.

Let’s look at the autocorrelations

source('autocor.r')
autocor(lret)

##            Autocor Partial Autocor
##  [1,] -0.015326596    -0.015326596
##  [2,] -0.062281851    -0.062531445
##  [3,]  0.033896147     0.032056340
##  [4,] -0.040650529    -0.043763370
##  [5,] -0.023600168    -0.020789966
##  [6,]  0.001727473    -0.005323083
##  [7,]  0.012223911     0.012174574
##  [8,] -0.001348682    -0.001627343
##  [9,] -0.060966241    -0.061690430
## [10,] -0.021055323    -0.024615881
## [11,] -0.011559290    -0.019179503
## [12,]  0.013130875     0.014208423

Box.test(lret,24,'Ljung-Box')

## 
##  Box-Ljung test
## 
## data:  lret
## X-squared = 32.845, df = 24, p-value = 0.1074

sqlret <- scale(lret)^2
autocor(sqlret,12)

##          Autocor Partial Autocor
##  [1,] 0.27671624    0.2767162398
##  [2,] 0.23497845    0.1715418545
##  [3,] 0.29496376    0.2160401858
##  [4,] 0.22547585    0.0940220313
##  [5,] 0.09085276   -0.0626345435
##  [6,] 0.12677680    0.0179938520
##  [7,] 0.08865174   -0.0120471599
##  [8,] 0.08999092    0.0407826539
##  [9,] 0.11403230    0.0680426212
## [10,] 0.11117521    0.0455413827
## [11,] 0.07659700    0.0006617388
## [12,] 0.04947875   -0.0397080496

Box.test(sqlret,12,'Ljung-Box')

## 
##  Box-Ljung test
## 
## data:  sqlret
## X-squared = 434.18, df = 12, p-value < 0.00000000000000022

The Box test on the log returns fails to reject white noise for any number of lags from 1 to 36.

The Box test on the squared log returns does reject thoroughly, so we must conclude that autocorrelation exists in the variance only.

For this analysis we are only interested in the data as an uncorrelated sample, so let us consider the histogram instead.

hist(lret, 40, main='', xlab='log return', density = 30, col = cols[1])

Rolling window calculations

We first calculate kurtosis, then estimates of alpha, for various sizes of rolling window.

library(StableEstim)
print(mainPars <- IGParametersEstim(as.numeric(lret)))

##         alpha          beta         gamma         delta 
##  1.7555602623 -0.3500208698  0.0038817309  0.0005086713

T <- length(lret)
ws <- seq(100,500,200)
nws <- length(ws)

sta_est_1 <- sta_est_2 <- sta_est_3 <- sta_est_4 <- alphases <- kurts <- skurts <- vector('list',nws)
for (i in 1:nws) {
  starts <- 1:(T-ws[i]+1)
  skurts[[i]] <- skurt <- sapply(starts, function(x) { kurtosis(lret[x:(x+ws[i]-1)]) } )
  kurts[[i]] <- skurt*ws[i]
  alphases[[i]] <- (1-skurt)*2
  cat('\n\n Starting Kout for window size',ws[i],'\n')
  sta_est_4[[i]] <- sapply(starts, function(x) { Estim(EstimMethod='Kout',data=as.numeric(lret[x:(x+ws[i]-1)]))@par[1] } )
  cat('Now Kogon-McCulloch \n')
  sta_est_1[[i]] <- sapply(starts, function(x) { IGParametersEstim(as.numeric(lret[x:(x+ws[i]-1)]))[1] } )
  cat('Now Cgmm\n')
  sta_est_3[[i]] <- sapply(starts, function(x) { Estim(EstimMethod='Cgmm',data=as.numeric(lret[x:(x+ws[i]-1)]))@par[1] } )
  cat('Now GMM\n')
  sta_est_2[[i]] <- sapply(starts, function(x) { Estim(EstimMethod='GMM',data=as.numeric(lret[x:(x+ws[i]-1)]))@par[1] } )
}
windowresults <- list(sta_est_1=sta_est_1, sta_est_2=sta_est_2, sta_est_3=sta_est_3, sta_est_4=sta_est_4, alphases=alphases, kurts=kurts, skurts=skurts)
saveRDS(windowresults,file=paste('NewYorkSErollingWindows',gsub(':','_',date()),'.Rds',sep=''))

We considered window sizes of 100, 300, 500.

And plot various things.

plot(c(1,1),c(2,2),type = 'n',xlab = 'Window rightmost point', ylab = 'alpha estimate', main = '', xlim = c(2013.33,(2013.33+T/251)), ylim = c(1.39,2.01))
cols <- viridis(nws*2+1)
for (i in 2) {
  w <- ws[i];  st <- w; times <- st:(T-w+st)
  lines(2013.33+(times-1)/251,windowresults$alphases[[i]],col=cols[2], lwd=1, lty=(5))
  lines(2013.33+(times-1)/251,windowresults$sta_est_1[[i]],col=cols[4], lwd=1, lty=(1))
  # lines(2013.33+(times-1)/251,windowresults$sta_est_2[[i]],col=cols[3], lwd=1, lty=3)
  # lines(2013.33+(times-1)/251,windowresults$sta_est_3[[i]],col=cols[4], lwd=1, lty=4)
  # lines(2013.33+(times-1)/251,windowresults$sta_est_4[[i]],col=cols[5], lwd=1, lty=5)
}
grid()
legend('bottomleft',legend=paste(paste('Window Size',ws[2]),c('Kogon-McCulloch','Kurtosis')),col=cols[c(4,2)],lwd=1,lty=c(1,5))

# legend('bottomleft',legend=c(ws,'Kurtosis','Kogon-McCulloch','GMM','Cgmm','Kout'),col=c(cols[1:nws],rep(cols[nws+1],5)),lwd=1,lty=c(rep(1,nws),1:5))
#legend('bottomleft',legend=paste(paste('Window Size',ws),rep(c('Kogon-McCulloch','Kurtosis'),each=nws)),col=cols[c(nws:1,(nws*2):(nws+1))],lwd=1,lty=c(nws:1,(nws*2):(nws+1)))

plot(c(1,1),c(2,2),type = 'n',xlab = 'Window rightmost point', ylab = 'Standardised excess kurtosis estimate', main = '', xlim = c(2013.33,(2013.33+T/251)), ylim = c(0,0.25))
cols <- viridis(nws+1)
for (i in 1:nws) {
  w <- ws[i];  st <- w; times <- st:(T-w+st)
  lines(2013.33+(times-1)/251,windowresults$skurts[[i]],col=cols[nws+1-i], lwd=2, lty=(nws+1-i))
}
grid()
legend('topleft',legend=paste('Window size',ws),col=cols[nws:1],lwd=2,lty=nws:1)

plot(c(1,1),c(2,2),type = 'n',xlab = 'Window rightmost point', ylab = 'kurtosis estimate', main = '', xlim = c(2013.33,(2013.33+T/251)), ylim = c(0,T/50))
cols <- viridis(nws+1)
for (i in 1:nws) {
  w <- ws[i];  st <- w; times <- st:(T-w+st)
  lines(2013.33+(times-1)/251,windowresults$kurts[[i]],col=cols[nws+1-i], lwd=2, lty=(nws+1-i))
}
grid()
legend('topleft',legend=paste('Window size',ws),col=cols[nws:1],lwd=2,lty=nws:1)

Here we plot the kurtosis using increasing windows, always starting from the first observation and ending at the time on the x-axis.

plot(c(1,1),c(2,2),type = 'n',xlab = 'Date', ylab = 'Excess kurtosis estimate', main = '', xlim = c(2013.33,(2013.33+T/251)), ylim = c(-0.5,4))
cols <- viridis(3)
x2 <- 21:T
kurt <- sapply(x2, function(x) { kurtosis(lret[1:x])*x } )
x <- 2013.33+(x2-1)/251
lines(x,kurt,col=cols[1], lwd=2, lty=1)
grid()
newyorkmdl <- lm(kurt~x)
abline(newyorkmdl, col=cols[2], lwd=1, lty=2)

(beta <- coef(lm(kurt~x2))[2])

##          x2 
## 0.002263121

(alphaestnewyork <- (1-beta)*2)

##       x2 
## 1.995474

Many alphas

ss <- 500
numsmpls <- 5000
alphas <- seq(1,1.95,0.05)
nalphas <- length(alphas)
alphalongvector <- matrix(rep(alphas,each=numsmpls))
samplekurtosisfixed <- function(a) {
  x <- rstable(ss,a,0)
  xstand <- x - mean(x)
  kurt <- (sum((xstand)^4))/((sum((xstand)^2))^2)-3/ss
}

cl <- makeCluster(floor(0.75*detectCores(logical = FALSE)))
nul <- clusterEvalQ(cl,{suppressPackageStartupMessages(library(stabledist,warn.conflicts=FALSE,quietly=TRUE))})
nul <- clusterExport(cl,c('samplekurtosisfixed','ss'))
outputs <- parRapply(cl,alphalongvector,samplekurtosisfixed)
stopCluster(cl)
results <- data.frame(alpha=alphalongvector,g_over_n=outputs)
saveRDS(results,file=paste('Kurtosisfixed',gsub(':','_',date()),'.Rds',sep=''))

cols <- viridis(5)
ratio <- results[,2]/(2-results[,1])
values <- unlist(tapply(ratio,factor(alphalongvector),mean))
modl2 <- lm(values~alphas)
preds <- predict(modl2,interval = 'confidence')
alldata <- data.frame(ratio=ratio, alpha=alphalongvector)
modl3 <- lm(ratio~alpha, data=alldata)
preds3 <- predict(modl3,newdata=data.frame(ratio=rep(NA,length(alphas)), alpha=alphas),interval = 'confidence')

plot(2-alphas,values,type='p',xlab = expression(2-alpha), ylab = 'Estimated slope', main='', pch=4, col=cols[1],ylim = c(0.4,0.6))
grid()
lines(c(0,1.2),c(0.5,0.5),col=cols[4], lwd=1, lty=1)
lines(2-alphas,preds3[,1], col=cols[2], lwd=1, lty=2)
lines(2-alphas,preds3[,2], col=cols[2], lwd=1, lty=3)
lines(2-alphas,preds3[,3], col=cols[2], lwd=1, lty=3)
legend('topright',legend = c('Means','Proposal','Regression','95% interval'), col=cols[c(1,4,2,2)], lwd = c(NA,1,1,1,1), lty=c(NA,1,2,3,3), pch=c(4,NA,NA,NA,NA))

vars <- matrix(0,nalphas,3)
for (a in 1:nalphas) {
  currkurt <- results[alphalongvector==alphas[a],2]
  currmat <- matrix(currkurt,100)
  sds <- apply(currmat,2,sd)
  vars[a,1] <- mean(sds)
  vars[a,2:3] <- quantile(sds,c(0.05,0.95))
}

plot(c(1,2),c(0,0.4),type='n',xlab = expression(alpha), ylab = 'Standard deviation', main='')
grid()
lines(c(alphas,2),c(vars[,1],0),col=cols[1], lwd=2, lty=1)
lines(c(alphas,2),c(vars[,2],0),col=cols[2], lwd=1, lty=2)
lines(c(alphas,2),c(vars[,3],0),col=cols[2], lwd=1, lty=2)
legend('topright',legend = c('Average SD','90% interval'), col=cols[c(1,2)], lwd = c(2,1), lty=c(1,2))

Extending the regression lines to more alpha options

nsamplespersize <- 5000
minsize <- 50
maxsize <- 500
samplesizes <- seq(minsize,maxsize,minsize)
nsamplesizes <- length(samplesizes)
alphas <- seq(1,2,0.1)
nalphas <- length(alphas)
params <- data.frame(SampleSize=rep(samplesizes,each=nalphas*nsamplespersize), 
                     Alpha=rep(alphas,each=nsamplespersize,times=nsamplesizes),
                     Sample=rep(1:nsamplespersize,times=nsamplesizes*nalphas))
samplekurtosis <- function(pars) {
  n <- pars[1]; a <- pars[2]
  x <- rstable(n,a,0)
  xstand <- x - mean(x)
  kurt <- (sum((xstand)^4))/((sum((xstand)^2))^2)-3/n
}

library(parallel)
cl <- makeCluster(max(1,floor(0.75*detectCores(logical = FALSE))))
nul <- clusterEvalQ(cl,{suppressPackageStartupMessages(library(stabledist,warn.conflicts=FALSE,quietly=TRUE))})
nul <- clusterExport(cl,c('samplekurtosis'))
outputs <- parRapply(cl,params,samplekurtosis)
stopCluster(cl)
results <- data.frame(params,g_over_n=outputs)
saveRDS(results,file=paste('Kurtosis',gsub(':','_',date()),'.Rds',sep=''))

cols = viridis(nalphas*2)
g <- results$g_over_n*results$SampleSize
slopes <- rep(0,nalphas)
for (a in 1:nalphas) {
  al <- alphas[a]
  ga <- g[results$Alpha==al]
  gg <- results$SampleSize[results$Alpha==al]
  ymean <- tapply(ga,gg,mean)
  slopes[a] <- coef(lm(ymean~samplesizes-1))[1]
}

plot(c(0,1),c(0,0.5),type='n',xlab = expression(2-alpha), ylab = 'Estimated slope', main='')
grid()
points(2-alphas, slopes, col=cols[1], pch=4)
lines(2-alphas, slopes, col=cols[1], lwd=2, lty=1)

simulation study

Sean van der Merwe

Coordinator of UFS Statistical Consultation Unit

Statistician