Libraries

# calculations
library(geomorph) # GPA 
library(shapes) # OPA
library(Morpho) # CVA
library(phangorn) # UPGMA
library(geosphere) # distm

# plotting and visualization
library(ggplot2) # plots
ggplot2::theme_set(theme_light())
library(ggforce) # geom_ellipse
library(mgcViz) # GAM visualization
library(viridis) # plot scale
library(rnaturalearth) # maps

Abbreviations

AT - Austria
ES - Spain
GR - Greece
HR - Croatia
HU - Hungary
MD - Moldova
ME - Montenegro
PL - Poland
PT - Portugal
RO - Romania
RS - Serbia
SI - Slovenia
TR - Turkey

Landmark coordinates

Read raw coordinates of 19 landmarks from Zenodo (Oleksa et al., 2022)

wings <- read.csv("https://zenodo.org/record/7244070/files/EU-raw-coordinates.csv", header = TRUE)

# extract sample classifier
wings <- tidyr::separate(
    data = wings,
    col = file,
    sep = c(7), # sample name is in the first 7 characters of file name
    into = c("sample", NA),
    remove = FALSE
  ) 

# extract country classifier
wings <- tidyr::separate(
    data = wings,
    col = sample,
    sep = c(2), # country code is in the first 2 characters of sample name
    into = c("country", NA),
    remove = FALSE
  ) 
head(wings, 2)
##                          file  sample country  x1  y1  x2  y2  x3  y3  x4  y4
## 1 AT-0001-031-003678-L.dw.png AT-0001      AT 219 191 238 190 292 270 287 216
## 2 AT-0001-031-003678-R.dw.png AT-0001      AT 213 190 234 189 289 268 281 211
##    x5  y5  x6  y6  x7  y7  x8  y8  x9  y9 x10 y10 x11 y11 x12 y12 x13 y13 x14
## 1 294 131 359 276 409 307 387 284 431 251 401 228 450 197 456 153 471 121 487
## 2 289 129 359 274 408 304 388 281 434 249 404 226 447 194 452 151 467 119 482
##   y14 x15 y15 x16 y16 x17 y17 x18 y18 x19 y19
## 1 299 525 263 604 224 642 222 651 193  91 259
## 2 297 521 259 601 223 640 222 648 193  89 259

Geographic data

Read latitude and longitude.

geo.data <- read.csv("https://zenodo.org/record/7244070/files/EU-geo-data.csv", header = TRUE)

sample.geo.data <- aggregate(cbind(geo.data$latitude, geo.data$longitude), by = list(geo.data$sample), FUN = mean)
sample.geo.data <- reshape::rename(sample.geo.data, c(Group.1 = "sample", V1 = "latitude", V2 = "longitude")) 

# extract country classifier
sample.geo.data <- tidyr::separate(
    data = sample.geo.data,
    col = sample,
    sep = c(2), # country code is in the first 2 characters of sample name
    into = c("country", NA),
    remove = FALSE
  ) 
head(sample.geo.data, 2)
##    sample country latitude longitude
## 1 AT-0001      AT  47.0038  15.39831
## 2 AT-0002      AT  47.0038  15.39831

Map of sampling locations

world <- ne_countries(scale = "medium", returnclass = "sf")

# jitter the coordinates to show all locations
jitter.x <- jitter(sample.geo.data$longitude, amount = 0.2)
jitter.y <- jitter(sample.geo.data$latitude, amount = 0.2)

ggplot(data = world) +
  geom_sf() +
  geom_point(data = sample.geo.data, 
             aes(x = jitter.x, y = jitter.y, colour = country, shape = country), size = 1) +
  scale_color_manual(name ="Country", values = rainbow(13)) +
  scale_shape_manual(name ="Country", values = c(0:2,15:25)) +
  coord_sf(xlim = c(-11, 33), ylim = c(34, 56)) + 
  xlab("Longitude") + ylab("Latitude")

Variable names

In order to avoid mistakes it is safer to use columns names instead of indexes

p <- 19  # number of landmarks
k <- 2   # number of dimensions, in this case 2 for coordinates (x, y)

# create coordinates names used by IdentiFly
xyNames <- c("x1", "y1")
for (i in 2:p) {
  xyNames <- c(xyNames, paste0("x", i))
  xyNames <- c(xyNames, paste0("y", i))
}
xyNames
##  [1] "x1"  "y1"  "x2"  "y2"  "x3"  "y3"  "x4"  "y4"  "x5"  "y5"  "x6"  "y6" 
## [13] "x7"  "y7"  "x8"  "y8"  "x9"  "y9"  "x10" "y10" "x11" "y11" "x12" "y12"
## [25] "x13" "y13" "x14" "y14" "x15" "y15" "x16" "y16" "x17" "y17" "x18" "y18"
## [37] "x19" "y19"
# create coordinates names used by geomorph
xy.Names <- c("1.X", "1.Y")
for (i in 2:p) {
  xy.Names <- c(xy.Names, paste(i, "X", sep = "."))
  xy.Names <- c(xy.Names, paste(i, "Y", sep = "."))
}
xy.Names
##  [1] "1.X"  "1.Y"  "2.X"  "2.Y"  "3.X"  "3.Y"  "4.X"  "4.Y"  "5.X"  "5.Y" 
## [11] "6.X"  "6.Y"  "7.X"  "7.Y"  "8.X"  "8.Y"  "9.X"  "9.Y"  "10.X" "10.Y"
## [21] "11.X" "11.Y" "12.X" "12.Y" "13.X" "13.Y" "14.X" "14.Y" "15.X" "15.Y"
## [31] "16.X" "16.Y" "17.X" "17.Y" "18.X" "18.Y" "19.X" "19.Y"
# The number of principal components used is 2*p-4 = 34, which is equal to the degrees of freedom
# create principal components names used by prcomp
pcNames <- paste0("PC", 1:(2*p-4))
pcNames
##  [1] "PC1"  "PC2"  "PC3"  "PC4"  "PC5"  "PC6"  "PC7"  "PC8"  "PC9"  "PC10"
## [11] "PC11" "PC12" "PC13" "PC14" "PC15" "PC16" "PC17" "PC18" "PC19" "PC20"
## [21] "PC21" "PC22" "PC23" "PC24" "PC25" "PC26" "PC27" "PC28" "PC29" "PC30"
## [31] "PC31" "PC32" "PC33" "PC34"
# create principal components names used by geomorph
compNames <- paste0("Comp", 1:(2*p-4))
compNames
##  [1] "Comp1"  "Comp2"  "Comp3"  "Comp4"  "Comp5"  "Comp6"  "Comp7"  "Comp8" 
##  [9] "Comp9"  "Comp10" "Comp11" "Comp12" "Comp13" "Comp14" "Comp15" "Comp16"
## [17] "Comp17" "Comp18" "Comp19" "Comp20" "Comp21" "Comp22" "Comp23" "Comp24"
## [25] "Comp25" "Comp26" "Comp27" "Comp28" "Comp29" "Comp30" "Comp31" "Comp32"
## [33] "Comp33" "Comp34"
geoNames <- c("latitude", "longitude")

GPA-alignment

Before analysis all landmark configurations have to be superimposed.

# Convert 2D array into a 3D array
wings.coords <- arrayspecs(wings[xyNames], p, k)
dimnames(wings.coords)[[3]] <- wings$file

# Align the coordinates using Generalized Procrustes Analysis 
GPA <- gpagen(wings.coords, print.progress = FALSE)

# Convert 3D array into a 2D array - opposite to arrayspecs
wings.aligned <- two.d.array(GPA$coords) 
head(wings.aligned, 2)
##                                    1.X         1.Y        2.X         2.Y
## AT-0001-031-003678-L.dw.png -0.2789939 -0.05498483 -0.2503918 -0.05609707
## AT-0001-031-003678-R.dw.png -0.2837028 -0.05412563 -0.2521392 -0.05513094
##                                    3.X        3.Y        4.X         4.Y
## AT-0001-031-003678-L.dw.png -0.1708259 0.06535370 -0.1772270 -0.01597587
## AT-0001-031-003678-R.dw.png -0.1714110 0.06482066 -0.1820723 -0.02097479
##                                    5.X        5.Y         6.X        6.Y
## AT-0001-031-003678-L.dw.png -0.1649395 -0.1436844 -0.07016610 0.07576223
## AT-0001-031-003678-R.dw.png -0.1681155 -0.1439400 -0.06642008 0.07548815
##                                     7.X       7.Y         8.X        8.Y
## AT-0001-031-003678-L.dw.png 0.004405225 0.1234260 -0.02820789 0.08837556
## AT-0001-031-003678-R.dw.png 0.006464705 0.1217058 -0.02302708 0.08668925
##                                    9.X        9.Y         10.X        10.Y
## AT-0001-031-003678-L.dw.png 0.03865483 0.03964703 -0.005994006 0.004432599
## AT-0001-031-003678-R.dw.png 0.04681374 0.03971740  0.002302572 0.004465114
##                                   11.X        11.Y       12.X       12.Y
## AT-0001-031-003678-L.dw.png 0.06834561 -0.04118537 0.07827983 -0.1072430
## AT-0001-031-003678-R.dw.png 0.06764016 -0.04257738 0.07616792 -0.1070389
##                                   13.X       13.Y      14.X      14.Y      15.X
## AT-0001-031-003678-L.dw.png 0.10150398 -0.1550677 0.1218870 0.1130044 0.1797898
## AT-0001-031-003678-R.dw.png 0.09945379 -0.1547447 0.1177652 0.1129438 0.1772376
##                                   15.Y      16.X        16.Y      17.X
## AT-0001-031-003678-L.dw.png 0.05963885 0.2994255 0.002608403 0.3566238
## AT-0001-031-003678-R.dw.png 0.05679621 0.2982396 0.004620497 0.3568349
##                                     17.Y      18.X        18.Y       19.X
## AT-0001-031-003678-L.dw.png 0.0003842025 0.3707599 -0.04305200 -0.4729295
## AT-0001-031-003678-R.dw.png 0.0040404527 0.3695364 -0.03932548 -0.4715685
##                                   19.Y
## AT-0001-031-003678-L.dw.png 0.04465730
## AT-0001-031-003678-R.dw.png 0.04657064

Average aligned coordinates within samples

sample.aligned <- aggregate(wings.aligned, by = list(wings$sample), FUN = mean)
names(sample.aligned)[names(sample.aligned) == "Group.1"] <- "sample" # rename column

# extract country code as classifier
sample.aligned <- tidyr::separate(
    data = sample.aligned,
    col = sample,
    sep = c(2),
    into = c("country", NA),
    remove = FALSE
  ) 
head(sample.aligned, 2)
##    sample country        1.X         1.Y        2.X         2.Y        3.X
## 1 AT-0001      AT -0.2844528 -0.05838585 -0.2540030 -0.06055061 -0.1791217
## 2 AT-0002      AT -0.2841005 -0.05812160 -0.2560924 -0.06061664 -0.1754031
##          3.Y        4.X         4.Y        5.X        5.Y         6.X
## 1 0.06483904 -0.1812782 -0.01908254 -0.1660425 -0.1442186 -0.07292371
## 2 0.06517893 -0.1838003 -0.01870428 -0.1677755 -0.1437020 -0.07313915
##          6.Y         7.X       7.Y         8.X        8.Y        9.X        9.Y
## 1 0.07500076 0.011911497 0.1244321 -0.01547428 0.09046047 0.04583909 0.04084050
## 2 0.07616091 0.009677258 0.1229683 -0.01582622 0.09223084 0.04708244 0.04139449
##          10.X        10.Y       11.X        11.Y       12.X       12.Y
## 1 0.001339073 0.006503687 0.06921933 -0.04011336 0.07974063 -0.1025214
## 2 0.001699646 0.006111097 0.07075491 -0.03934851 0.07961461 -0.1036169
##        13.X       13.Y      14.X      14.Y     15.X       15.Y      16.X
## 1 0.1036491 -0.1503903 0.1187769 0.1142739 0.179305 0.05982758 0.2972238
## 2 0.1001694 -0.1478836 0.1202677 0.1129797 0.179610 0.06035730 0.3033193
##           16.Y      17.X         17.Y      18.X        18.Y       19.X
## 1 0.0017221934 0.3518606 -0.002184519 0.3653600 -0.04534943 -0.4709287
## 2 0.0007875057 0.3524397 -0.002419179 0.3614544 -0.04498977 -0.4699519
##         19.Y
## 1 0.04489640
## 2 0.04123325

Principal component analysis of the samples

# Convert 2D array into a 3D array
sample.3D <- arrayspecs(sample.aligned[xy.Names], p, k)
dimnames(sample.3D)[[3]] <- sample.aligned$sample

sample.pca <- gm.prcomp(sample.3D)
sample.pca.scores <- as.data.frame(sample.pca$x[ , compNames])
sample.pca.scores <- cbind(sample.geo.data, sample.pca.scores)

# create plot labels
variance.tab <- summary(sample.pca)$PC.summary
## 
## Ordination type: Principal Component Analysis 
## Centering by OLS mean
## Orthogonal projection of OLS residuals
## Number of observations: 1725 
## Number of vectors 34 
## 
## Importance of Components:
##                               Comp1        Comp2        Comp3        Comp4
## Eigenvalues            0.0001959343 3.293466e-05 2.336774e-05 0.0000168889
## Proportion of Variance 0.5120436334 8.606959e-02 6.106792e-02 0.0441364999
## Cumulative Proportion  0.5120436334 5.981132e-01 6.591811e-01 0.7033176414
##                               Comp5        Comp6        Comp7        Comp8
## Eigenvalues            1.453208e-05 1.257551e-05 1.053209e-05 9.666578e-06
## Proportion of Variance 3.797732e-02 3.286413e-02 2.752395e-02 2.526209e-02
## Cumulative Proportion  7.412950e-01 7.741591e-01 8.016830e-01 8.269451e-01
##                               Comp9       Comp10       Comp11       Comp12
## Eigenvalues            8.410956e-06 7.351429e-06 7.155121e-06 6.222810e-06
## Proportion of Variance 2.198072e-02 1.921181e-02 1.869879e-02 1.626234e-02
## Cumulative Proportion  8.489259e-01 8.681377e-01 8.868364e-01 9.030988e-01
##                              Comp13       Comp14       Comp15       Comp16
## Eigenvalues            5.628068e-06 4.604464e-06 3.768274e-06 2.899272e-06
## Proportion of Variance 1.470807e-02 1.203305e-02 9.847794e-03 7.576794e-03
## Cumulative Proportion  9.178069e-01 9.298399e-01 9.396877e-01 9.472645e-01
##                              Comp17       Comp18       Comp19       Comp20
## Eigenvalues            2.505382e-06 2.253050e-06 2.138021e-06 1.835601e-06
## Proportion of Variance 6.547424e-03 5.887994e-03 5.587382e-03 4.797056e-03
## Cumulative Proportion  9.538119e-01 9.596999e-01 9.652873e-01 9.700844e-01
##                              Comp21       Comp22       Comp23       Comp24
## Eigenvalues            1.671666e-06 1.485871e-06 1.303561e-06 1.196571e-06
## Proportion of Variance 4.368638e-03 3.883092e-03 3.406652e-03 3.127051e-03
## Cumulative Proportion  9.744530e-01 9.783361e-01 9.817427e-01 9.848698e-01
##                              Comp25       Comp26       Comp27       Comp28
## Eigenvalues            1.091115e-06 7.915753e-07 7.513697e-07 7.325161e-07
## Proportion of Variance 2.851457e-03 2.068658e-03 1.963587e-03 1.914316e-03
## Cumulative Proportion  9.877212e-01 9.897899e-01 9.917535e-01 9.936678e-01
##                              Comp29       Comp30       Comp31       Comp32
## Eigenvalues            6.351800e-07 5.476202e-07 4.278173e-07 3.188203e-07
## Proportion of Variance 1.659943e-03 1.431120e-03 1.118033e-03 8.331869e-04
## Cumulative Proportion  9.953277e-01 9.967589e-01 9.978769e-01 9.987101e-01
##                              Comp33       Comp34
## Eigenvalues            2.719570e-07 2.216305e-07
## Proportion of Variance 7.107170e-04 5.791966e-04
## Cumulative Proportion  9.994208e-01 1.000000e+00
variance <- variance.tab["Proportion of Variance", "Comp1"]
variance <- round(100 * variance, 1)
label.x <- paste0("PC1 (", variance, "%)")
variance <- variance.tab["Proportion of Variance", "Comp2"]
variance <- round(100 * variance, 1)
label.y <- paste0("PC2 (", variance, "%)")

ggplot(sample.pca.scores, aes(x = Comp1, y = Comp2, shape = sample.aligned$country, color = sample.aligned$country)) +
  geom_point() +
  scale_shape_manual(name ="Country", values = c(0:2,15:25)) +
  scale_color_manual(name ="Country", values = rainbow(13)) +
  stat_ellipse() + 
  xlab(label.x) + ylab(label.y)

Differences in wing shape

# define which landmarks are connected by lines in wireframe graph
link.x <- c(1, 1, 2, 2, 3,  3, 4,  4,  5, 6, 7,  7,  7, 8,  9,  9, 10, 11, 11, 12, 13, 14, 15, 16, 17)
link.y <- c(2, 3, 4, 5, 6, 19, 6, 10, 12, 8, 8, 14, 19, 9, 10, 15, 11, 12, 16, 13, 18, 15, 16, 17, 18)
links.apis <- cbind(link.x, link.y)

# mark minimum blue and maximum red
GP1 <- gridPar(pt.bg = "blue", link.col = "blue", pt.size = 1, 
               tar.pt.bg ="red", tar.link.col ="red")
# wing shape change along PC1
plotRefToTarget(M1 = sample.pca$shapes$shapes.comp1$min, 
                M2 = sample.pca$shapes$shapes.comp1$max, 
                gridPars=GP1, method = "points", links = links.apis)

# wing shape change along PC2 
plotRefToTarget(M1 = sample.pca$shapes$shapes.comp2$min, 
                M2 = sample.pca$shapes$shapes.comp2$max, 
                gridPars=GP1, method = "points", links = links.apis)

# countries difer significantly in wing shape
country.MANOVA <- manova(as.matrix(sample.pca.scores[ , compNames]) ~ country, data=sample.pca.scores)
summary(country.MANOVA)
##             Df Pillai approx F num Df den Df    Pr(>F)    
## country     12 3.7005   22.162    408  20280 < 2.2e-16 ***
## Residuals 1712                                            
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Visualization of the first two principal components with the generalized additive model

# PC1
ggplot(data = world) +
  geom_sf() +
  geom_point(data = sample.pca.scores, 
             aes(x = jitter.x, y = jitter.y, colour = Comp1), size = 1) +
  coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) +
  scale_color_viridis(name = "PC1") +
  xlab("Longitude") + ylab("Latitude")

GAM <- mgcv::gam(Comp1 ~ s(longitude, latitude), data = sample.pca.scores)
anova(GAM)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Comp1 ~ s(longitude, latitude)
## 
## Approximate significance of smooth terms:
##                         edf Ref.df     F p-value
## s(longitude,latitude) 26.18  28.43 720.7  <2e-16
viz.GAM <- getViz(GAM)
plot(viz.GAM, 1) +
  l_fitRaster() +
  l_fitContour(color ="grey30") + 
  l_points() +
  labs(title = NULL) +
  scale_fill_continuous(type = "viridis", name = "PC1", na.value = "transparent") +
  scale_x_continuous(limits = c(-11, 32), expand = c(0, 0)) +
  scale_y_continuous(limits = c(34, 57), expand = c(0, 0)) +
  geom_path(data = map_data("world"), aes(x = long, y = lat, group = group),
            color ="black", inherit.aes = FALSE)

# PC2
ggplot(data = world) +
  geom_sf() +
  geom_point(data = sample.pca.scores, 
             aes(x = jitter.x, y = jitter.y, colour = Comp2), size = 1) +
  coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) +
  scale_color_viridis(name = "PC2") +
  xlab("Longitude") + ylab("Latitude")

GAM <- mgcv::gam(Comp2 ~ s(longitude, latitude), data = sample.pca.scores)
anova(GAM)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Comp2 ~ s(longitude, latitude)
## 
## Approximate significance of smooth terms:
##                         edf Ref.df     F p-value
## s(longitude,latitude) 27.86  28.90 58.47  <2e-16
viz.GAM <- getViz(GAM)
plot(viz.GAM, 1) +
  l_fitRaster() + scale_fill_continuous(na.value = "transparent") +
  l_fitContour(color ="grey30") + 
  l_points() +
  labs(title = NULL) +
  scale_fill_continuous(type = "viridis", name = "PC2", na.value = "transparent") +
  scale_x_continuous(limits = c(-11, 32), expand = c(0, 0)) +
  scale_y_continuous(limits = c(34, 57), expand = c(0, 0)) +
  geom_path(data = map_data("world"), aes(x = long, y = lat, group = group),
            color ="black", inherit.aes = FALSE)

Canonical variate analysis based on countries

# Convert 2D array into a 3D array
sample.3D <- arrayspecs(sample.aligned[xy.Names], p, k)
dimnames(sample.3D)[[3]] <- sample.aligned$sample

# use equal prior probability for all groups
n.country <- length(unique(sample.aligned$country)) # number of groups
sample.cva <- CVA(sample.3D, sample.aligned$country, rounds = 10000, cv = TRUE,
                      prior = rep(1/n.country, n.country))
## singular Covariance matrix: General inverse is used. Threshold for zero eigenvalue is 1e-10
sample.cva.scores <- as.data.frame(sample.cva$CVscores)
# remove unwanted spaces in variable names otherwise use `CV 1`
colnames(sample.cva.scores) <- gsub(" ", "", colnames(sample.cva.scores)) 
sample.cva.scores <- cbind(sample.geo.data, sample.cva.scores)

ggplot(sample.cva.scores, aes(x = CV1, y = CV2, shape = country, color = country)) +
  geom_point() +
  scale_shape_manual(name ="Country", values = c(0:2,15:25)) +
  scale_color_manual(name ="Country", values = rainbow(13)) +
  stat_ellipse() 

Confusion matrix

Classification of the samples to countries.

CVA.class <- typprobClass(sample.cva$CVscores, groups = as.factor(sample.pca.scores$country), outlier = 0)
print(CVA.class)
##  cross-validated classification results in frequencies
##     
##       AT  ES  GR  HR  HU  MD  ME  PL  PT  RO  RS  SI  TR
##   AT  10   0   0   0   0   0   0   0   0   0   0   0   0
##   ES   0 414   0   0   0   0   0   0 102   0   0   0   0
##   GR   0   0 239   1   0   0   0   0   0   2   1   0   1
##   HR   0   0   1 127   7   0   0   0   0   1   1  23   0
##   HU   0   0   0   0  17   0   0   0   0   5   0   0   0
##   MD   0   0   0   0   0   9   0   0   0   0   1   0   0
##   ME   0   0   0   0   0   0  17   0   0   0   3   0   0
##   PL   0   0   0   6  11   0   0 231   0   0   0   3   2
##   PT   0  39   0   0   0   0   0   0 153   0   0   0   0
##   RO   0   0   1   3   2   3   0   1   0 180   0   7   0
##   RS   0   0   0   0   0   0   5   0   0   0  15   0   0
##   SI   0   0   0   1   3   0   0   0   0   1   0  16   0
##   TR   0   0   0   0   0   0   0   0   0   0   0   0  60
## 
## 
##  cross-validated classification result in %
##     
##             AT        ES        GR        HR        HU        MD        ME
##   AT 100.00000   0.00000   0.00000   0.00000   0.00000   0.00000   0.00000
##   ES   0.00000  80.23256   0.00000   0.00000   0.00000   0.00000   0.00000
##   GR   0.00000   0.00000  97.95082   0.40984   0.00000   0.00000   0.00000
##   HR   0.00000   0.00000   0.62500  79.37500   4.37500   0.00000   0.00000
##   HU   0.00000   0.00000   0.00000   0.00000  77.27273   0.00000   0.00000
##   MD   0.00000   0.00000   0.00000   0.00000   0.00000  90.00000   0.00000
##   ME   0.00000   0.00000   0.00000   0.00000   0.00000   0.00000  85.00000
##   PL   0.00000   0.00000   0.00000   2.37154   4.34783   0.00000   0.00000
##   PT   0.00000  20.31250   0.00000   0.00000   0.00000   0.00000   0.00000
##   RO   0.00000   0.00000   0.50761   1.52284   1.01523   1.52284   0.00000
##   RS   0.00000   0.00000   0.00000   0.00000   0.00000   0.00000  25.00000
##   SI   0.00000   0.00000   0.00000   4.76190  14.28571   0.00000   0.00000
##   TR   0.00000   0.00000   0.00000   0.00000   0.00000   0.00000   0.00000
##     
##             PL        PT        RO        RS        SI        TR
##   AT   0.00000   0.00000   0.00000   0.00000   0.00000   0.00000
##   ES   0.00000  19.76744   0.00000   0.00000   0.00000   0.00000
##   GR   0.00000   0.00000   0.81967   0.40984   0.00000   0.40984
##   HR   0.00000   0.00000   0.62500   0.62500  14.37500   0.00000
##   HU   0.00000   0.00000  22.72727   0.00000   0.00000   0.00000
##   MD   0.00000   0.00000   0.00000  10.00000   0.00000   0.00000
##   ME   0.00000   0.00000   0.00000  15.00000   0.00000   0.00000
##   PL  91.30435   0.00000   0.00000   0.00000   1.18577   0.79051
##   PT   0.00000  79.68750   0.00000   0.00000   0.00000   0.00000
##   RO   0.50761   0.00000  91.37056   0.00000   3.55330   0.00000
##   RS   0.00000   0.00000   0.00000  75.00000   0.00000   0.00000
##   SI   0.00000   0.00000   4.76190   0.00000  76.19048   0.00000
##   TR   0.00000   0.00000   0.00000   0.00000   0.00000 100.00000
## 
## 
##  overall classification accuracy: 86.26087 %
## 
##  Kappa statistic: 0.83708

Classification of the samples to countries

The map shows only incorrectly classifies samples. Color and shape of the points indicate country to which a sample was classified. The classification with cross-validation

sample.class <- data.frame(sample = sample.geo.data$sample, 
                          country = sample.geo.data$country, 
                          latitude = sample.geo.data$latitude, 
                          longitude = sample.geo.data$longitude, 
                          classified.as = CVA.class$groupaffinCV)

# add column indicating incorrect classification
sample.class$error <- ifelse(sample.class$country == sample.class$classified.as, "OK", "error")

# only the incorrectly classified samples
sample.error <- sample.class[sample.class$error == "error", 
                            c("sample", "country", "latitude", "longitude","classified.as")]

# incorrect countries classification map
ggplot(data = world) +
  geom_sf() +
  geom_point(data = sample.error, 
             aes(x = longitude, y = latitude, 
                 colour = classified.as, 
                 shape = classified.as), size = 2) +
  scale_color_manual(name ="Classified as", values = rainbow(13)) +
  scale_shape_manual(name ="Classified as", values = c(0:2, 15:25)) +
  coord_sf(xlim = c(-11, 33), ylim = c(34, 56), expand = FALSE) +
  labs(x = "Longitude", y = "Latitude")

Differences between countries in wing shape

# Mahalanobis Distnces between countries
dist <- sample.cva$Dist
MD.dist <- as.matrix(dist$GroupdistMaha)
MD.dist
##           AT        ES        GR        HR        HU        MD        ME
## AT  0.000000 10.360206  9.670937  7.467583  6.369374  7.460003  6.928884
## ES 10.360206  0.000000 11.311107 11.134169 10.974498 10.517820 10.505608
## GR  9.670937 11.311107  0.000000  5.249058  6.638746  6.750555  6.415824
## HR  7.467583 11.134169  5.249058  0.000000  2.954997  4.627411  5.274550
## HU  6.369374 10.974498  6.638746  2.954997  0.000000  5.158678  5.792680
## MD  7.460003 10.517820  6.750555  4.627411  5.158678  0.000000  5.853972
## ME  6.928884 10.505608  6.415824  5.274550  5.792680  5.853972  0.000000
## PL  6.986357  9.240762  6.253255  3.910777  3.909524  4.650527  6.207472
## PT 10.271098  1.891779 11.483281 11.224857 10.975430 10.376974 10.428489
## RO  6.367227 10.598027  5.850517  3.767795  3.801100  3.607623  4.882649
## RS  7.079043 10.793296  6.181560  4.718061  5.363764  5.233406  1.871993
## SI  8.096234 11.474258  5.994399  2.451566  3.592021  4.634304  6.506716
## TR  7.759354 10.203552  4.976778  4.316541  5.285749  5.774989  6.026874
##          PL        PT        RO        RS        SI        TR
## AT 6.986357 10.271098  6.367227  7.079043  8.096234  7.759354
## ES 9.240762  1.891779 10.598027 10.793296 11.474258 10.203552
## GR 6.253255 11.483281  5.850517  6.181560  5.994399  4.976778
## HR 3.910777 11.224857  3.767795  4.718061  2.451566  4.316541
## HU 3.909524 10.975430  3.801100  5.363764  3.592021  5.285749
## MD 4.650527 10.376974  3.607623  5.233406  4.634304  5.774989
## ME 6.207472 10.428489  4.882649  1.871993  6.506716  6.026874
## PL 0.000000  9.192864  4.428774  5.812371  4.148400  5.165371
## PT 9.192864  0.000000 10.564756 10.711068 11.439740 10.371718
## RO 4.428774 10.564756  0.000000  4.582683  4.244022  4.349041
## RS 5.812371 10.711068  4.582683  0.000000  5.886266  6.044034
## SI 4.148400 11.439740  4.244022  5.886266  0.000000  5.170372
## TR 5.165371 10.371718  4.349041  6.044034  5.170372  0.000000
# Significance of differences betwen countries
MD.prob <- as.matrix(dist$probsMaha)
MD.prob
##        AT    ES    GR     HR     HU     MD     ME     PL    PT     RO     RS
## AT 0.0000 1e-04 1e-04 0.0001 0.0022 0.0024 0.0009 0.0001 1e-04 0.0002 0.0005
## ES 0.0001 0e+00 1e-04 0.0001 0.0001 0.0001 0.0001 0.0001 1e-04 0.0001 0.0001
## GR 0.0001 1e-04 0e+00 0.0001 0.0001 0.0001 0.0001 0.0001 1e-04 0.0001 0.0001
## HR 0.0001 1e-04 1e-04 0.0000 0.0243 0.0121 0.0001 0.0001 1e-04 0.0001 0.0001
## HU 0.0022 1e-04 1e-04 0.0243 0.0000 0.0169 0.0003 0.0009 1e-04 0.0019 0.0014
## MD 0.0024 1e-04 1e-04 0.0121 0.0169 0.0000 0.0065 0.0106 1e-04 0.0727 0.0167
## ME 0.0009 1e-04 1e-04 0.0001 0.0003 0.0065 0.0000 0.0001 1e-04 0.0001 0.8722
## PL 0.0001 1e-04 1e-04 0.0001 0.0009 0.0106 0.0001 0.0000 1e-04 0.0001 0.0001
## PT 0.0001 1e-04 1e-04 0.0001 0.0001 0.0001 0.0001 0.0001 0e+00 0.0001 0.0001
## RO 0.0002 1e-04 1e-04 0.0001 0.0019 0.0727 0.0001 0.0001 1e-04 0.0000 0.0002
## RS 0.0005 1e-04 1e-04 0.0001 0.0014 0.0167 0.8722 0.0001 1e-04 0.0002 0.0000
## SI 0.0001 1e-04 1e-04 0.0951 0.0526 0.0430 0.0001 0.0004 1e-04 0.0004 0.0005
## TR 0.0001 1e-04 1e-04 0.0001 0.0001 0.0012 0.0001 0.0001 1e-04 0.0001 0.0001
##        SI     TR
## AT 0.0001 0.0001
## ES 0.0001 0.0001
## GR 0.0001 0.0001
## HR 0.0951 0.0001
## HU 0.0526 0.0001
## MD 0.0430 0.0012
## ME 0.0001 0.0001
## PL 0.0004 0.0001
## PT 0.0001 0.0001
## RO 0.0004 0.0001
## RS 0.0005 0.0001
## SI 0.0000 0.0001
## TR 0.0001 0.0000

Similarity trees

# UPGMA tree
tree.upgma <- upgma(MD.dist)
plot(tree.upgma, label.offset = 0.1, cex = 0.5)

Isolation by distance

There is significant correlation between geographical distances and Mahalanobis distances between countries

# calculate geographical distance between countries
country.geo.data <- aggregate(cbind(sample.geo.data$latitude, sample.geo.data$longitude), 
                             by = list(sample.geo.data$country), FUN = mean)
country.geo.data <- reshape::rename(country.geo.data, 
                                    c(Group.1 = "country", V1 = "latitude", V2 = "longitude")) 

geo.dist <- distm(country.geo.data[c("longitude","latitude")], fun = distGeo)
row.names(geo.dist) <- country.geo.data$country
colnames(geo.dist) <- country.geo.data$country
geo.dist <- geo.dist / 1000 # convert meters to kilometers
geo.dist
##           AT        ES        GR        HR        HU        MD        ME
## AT    0.0000 1677.4948 1118.3415  219.0430  247.3659  970.9450  582.7968
## ES 1677.4948    0.0000 2280.7843 1714.9711 1922.1692 2627.6241 1897.1536
## GR 1118.3415 2280.7843    0.0000  905.0233 1029.1889 1003.6747  539.5506
## HR  219.0430 1714.9711  905.0233    0.0000  283.7703  915.0564  366.4523
## HU  247.3659 1922.1692 1029.1889  283.7703    0.0000  725.2592  539.1022
## MD  970.9450 2627.6241 1003.6747  915.0564  725.2592    0.0000  863.2765
## ME  582.7968 1897.1536  539.5506  366.4523  539.1022  863.2765    0.0000
## PL  684.1116 2269.8425 1472.0395  813.1159  535.6565  745.8927 1050.5456
## PT 2115.2567  480.9975 2761.7533 2172.5472 2362.3395 3078.4628 2372.3072
## RO  740.5166 2364.5410  801.3884  650.0248  509.1914  284.0491  581.5350
## RS  594.4712 2076.2853  577.0488  416.3741  457.5673  640.8441  222.4330
## SI  109.1522 1589.4437 1091.2539  191.8487  334.4072 1044.4924  551.7388
## TR 1098.8973 2543.9486  457.4149  929.7696  919.7317  601.6676  649.3431
##           PL        PT        RO        RS        SI        TR
## AT  684.1116 2115.2567  740.5166  594.4712  109.1522 1098.8973
## ES 2269.8425  480.9975 2364.5410 2076.2853 1589.4437 2543.9486
## GR 1472.0395 2761.7533  801.3884  577.0488 1091.2539  457.4149
## HR  813.1159 2172.5472  650.0248  416.3741  191.8487  929.7696
## HU  535.6565 2362.3395  509.1914  457.5673  334.4072  919.7317
## MD  745.8927 3078.4628  284.0491  640.8441 1044.4924  601.6676
## ME 1050.5456 2372.3072  581.5350  222.4330  551.7388  649.3431
## PL    0.0000 2665.4718  727.4901  906.2962  793.2635 1222.9193
## PT 2665.4718    0.0000 2822.2746 2545.8292 2034.4694 3020.5177
## RO  727.4901 2822.2746    0.0000  359.6086  798.7755  495.4293
## RS  906.2962 2545.8292  359.6086    0.0000  604.4626  513.3960
## SI  793.2635 2034.4694  798.7755  604.4626    0.0000 1117.1129
## TR 1222.9193 3020.5177  495.4293  513.3960 1117.1129    0.0000
vegan::mantel(geo.dist, MD.dist, permutations = 9999, method = "spearman")
## 
## Mantel statistic based on Spearman's rank correlation rho 
## 
## Call:
## vegan::mantel(xdis = geo.dist, ydis = MD.dist, method = "spearman",      permutations = 9999) 
## 
## Mantel statistic r: 0.7046 
##       Significance: 0.0015 
## 
## Upper quantiles of permutations (null model):
##   90%   95% 97.5%   99% 
## 0.228 0.312 0.398 0.574 
## Permutation: free
## Number of permutations: 9999
# convert distance table to get distance in one column
MD.distance <- as.dist(MD.dist)
MD.distance <- as.numeric(MD.distance)
MD.distance <- data.frame(t(combn(rownames(MD.dist),2)), MD.distance)

geo.distance <- as.dist(geo.dist)
geo.distance <- as.numeric(geo.distance)
geo.distance <- data.frame(t(combn(rownames(geo.dist),2)), geo.distance)

geo.distance$MD.distance <- MD.distance$MD.distance
geo.distance$dist.name <- paste0(geo.distance$X1, "-", geo.distance$X2)

outliers <- subset(geo.distance, dist.name %in% c("AT-HU","AT-HR", "AT-SI"))

ggplot(geo.distance, aes(x = geo.distance, y = MD.distance)) +
  geom_point() +
  coord_trans(x = "log10") +
  geom_text( data = outliers, aes(x = geo.distance , y = MD.distance, label = dist.name), hjust =-0.2) +
  xlab("Geographical distance (km)") + ylab("Mahalanobis distance between wing shape")

Canonical variate analysis based on regions

# add column with regions consisting of well represented countries or pairs of neighboring countries
sample.aligned$region <- ifelse(sample.aligned$country == "ES"
                                     | sample.class$country == "PT"
                                     , "ES-PT", sample.aligned$country)
sample.aligned$region <- ifelse(sample.aligned$country == "RO"
                                     | sample.class$country == "MD"
                                     , "RO-MD", sample.aligned$region)
sample.aligned$region <- ifelse(sample.aligned$country == "HR"
                                     | sample.class$country == "SI"
                                     , "HR-SI", sample.aligned$region)

# exclude countries with sample size smaller than 25
sample.region <- sample.aligned[sample.aligned$country != "AT"
                               & sample.aligned$country != "HU"
                               & sample.aligned$country != "ME"
                               & sample.aligned$country != "RS", ]

sample.3D <- arrayspecs(sample.region[xy.Names], p, k)
dimnames(sample.3D)[[3]] <- sample.region$sample

# use equal prior probability for all groups
n.region <- length(unique(sample.region$region)) # number of groups
region.cva <- CVA(sample.3D, sample.region$region, rounds = 10000, cv = TRUE, 
                  prior = rep(1/n.region, n.region))
## singular Covariance matrix: General inverse is used. Threshold for zero eigenvalue is 1e-10
region.cva.scores <- as.data.frame(region.cva$CVscores)
# remove unwanted spaces in variable names otherwise use `CV 1`
colnames(region.cva.scores) <- gsub(" ", "", colnames(region.cva.scores)) 

ggplot(region.cva.scores, aes(x = CV1, y = CV2, shape = sample.region$region, color = sample.region$region)) +
  geom_point() +
  scale_shape_manual(name ="Region", values = c(0:2,19,5:6)) +
  scale_color_manual(name ="Region", values = rainbow(6)) +
  stat_ellipse() 

CVA.class.region <- typprobClass(region.cva$CVscores, groups = as.factor(sample.region$region), outlier = 0)
print(CVA.class.region)
##  cross-validated classification results in frequencies
##        
##         ES-PT  GR HR-SI  PL RO-MD  TR
##   ES-PT   708   0     0   0     0   0
##   GR        0 240     1   0     2   1
##   HR-SI     0   1   179   0     1   0
##   PL        0   0     8 241     2   2
##   RO-MD     0   1     7   1   197   1
##   TR        0   0     0   0     0  60
## 
## 
##  cross-validated classification result in %
##        
##             ES-PT        GR     HR-SI        PL     RO-MD        TR
##   ES-PT 100.00000   0.00000   0.00000   0.00000   0.00000   0.00000
##   GR      0.00000  98.36066   0.40984   0.00000   0.81967   0.40984
##   HR-SI   0.00000   0.55249  98.89503   0.00000   0.55249   0.00000
##   PL      0.00000   0.00000   3.16206  95.25692   0.79051   0.79051
##   RO-MD   0.00000   0.48309   3.38164   0.48309  95.16908   0.48309
##   TR      0.00000   0.00000   0.00000   0.00000   0.00000 100.00000
## 
## 
##  overall classification accuracy: 98.30611 %
## 
##  Kappa statistic: 0.9772

Comparison with lineages dataset Nawrocka et al., (2018a) and Nawrocka et al., (2018b)

Read landmark coordinates from lineages dataset

wings.lin <- read.csv("https://zenodo.org/record/7567336/files/Nawrocka_et_al2018.csv", header = TRUE)

# extract sample classifier
wings.lin <- tidyr::separate(
  data = wings.lin,
  col = file,
  sep = c(10), # sample name is in the first 10 characters of file name
  into = c("sample", NA),
  remove = FALSE
) 

# extract lineage classifier
wings.lin <- tidyr::separate(
  data = wings.lin,
  col = file,
  sep = c(1), # lineage code is in the first character of file name
  into = c("lineage", NA),
  remove = FALSE
) 
head(wings.lin, 2)
##                   file lineage     sample  x1  y1  x2  y2  x3  y3  x4  y4  x5
## 1 A-ada-0698-01.dw.png       A A-ada-0698 191 229 207 227 239 276 239 240 238
## 2 A-ada-0698-02.dw.png       A A-ada-0698 166 243 182 241 212 295 216 258 218
##    y5  x6  y6  x7  y7  x8  y8  x9  y9 x10 y10 x11 y11 x12 y12 x13 y13 x14 y14
## 1 185 295 272 331 288 319 274 341 249 325 240 344 214 342 188 348 165 372 276
## 2 204 270 294 303 312 291 297 318 274 301 263 323 240 326 214 336 192 346 302
##   x15 y15 x16 y16 x17 y17 x18 y18 x19 y19
## 1 392 249 438 219 460 214 461 191 118 289
## 2 369 276 414 249 441 245 446 224  91 295

GPA-alignment of lineages dataset

Before analysis all landmark configurations have to be superimposed.

# Convert 2D array into a 3D array
wings.lin.3D <- arrayspecs(wings.lin[xyNames], p, k)
dimnames(wings.lin.3D)[[3]] <- wings.lin$file
# Align the coordinates using Generalized Procrustes Analysis 
GPA.lin <- gpagen(wings.lin.3D, print.progress = FALSE)
# Convert 3D array into a 2D array - opposite to arrayspecs
wings.lin.aligned <- two.d.array(GPA.lin$coords) 
head(wings.lin.aligned, 2)
##                             1.X         1.Y        2.X         2.Y        3.X
## A-ada-0698-01.dw.png -0.2882258 -0.06434325 -0.2501565 -0.06286477 -0.1943604
## A-ada-0698-02.dw.png -0.2923716 -0.06310753 -0.2547431 -0.06445348 -0.1961197
##                             3.Y        4.X         4.Y        5.X        5.Y
## A-ada-0698-01.dw.png 0.06365687 -0.1805428 -0.02027358 -0.1617611 -0.1488831
## A-ada-0698-02.dw.png 0.06734610 -0.1791741 -0.01788917 -0.1633642 -0.1430801
##                              6.X        6.Y        7.X       7.Y          8.X
## A-ada-0698-01.dw.png -0.06227036 0.07582712 0.01551940 0.1269482 -0.007085048
## A-ada-0698-02.dw.png -0.06100973 0.07700318 0.01203246 0.1256895 -0.012781515
##                             8.Y        9.X        9.Y       10.X       10.Y
## A-ada-0698-01.dw.png 0.08970314 0.05380538 0.03986298 0.01995414 0.01273725
## A-ada-0698-02.dw.png 0.08832114 0.05477287 0.04040039 0.01750110 0.01130163
##                            11.X        11.Y       12.X        12.Y      13.X
## A-ada-0698-01.dw.png 0.07423407 -0.04058406 0.07955375 -0.10196906 0.1023722
## A-ada-0698-02.dw.png 0.07342713 -0.03765121 0.08577614 -0.09750888 0.1135807
##                            13.Y      14.X      14.Y      15.X       15.Y
## A-ada-0698-01.dw.png -0.1532877 0.1157134 0.1147105 0.1727068 0.05943997
## A-ada-0698-02.dw.png -0.1466154 0.1141161 0.1113145 0.1729860 0.05559041
##                           16.X        16.Y      17.X         17.Y      18.X
## A-ada-0698-01.dw.png 0.2914686 0.007154321 0.3446812  0.003942388 0.3558432
## A-ada-0698-02.dw.png 0.2832388 0.002084332 0.3468660 -0.001641282 0.3628345
##                             18.Y       19.X       19.Y
## A-ada-0698-01.dw.png -0.04929392 -0.4814499 0.04751676
## A-ada-0698-02.dw.png -0.04945431 -0.4775679 0.04235017

Average aligned coordinates within samples for lineages dataset

sample.lin.aligned <- aggregate(wings.lin.aligned, by = list(wings.lin$sample), FUN = mean)
names(sample.lin.aligned)[names(sample.lin.aligned) == "Group.1"] <- "sample" # rename column

# extract lineage code as classifier
sample.lin.aligned <- tidyr::separate(
    data = sample.lin.aligned,
    col = sample,
    sep = c(1),
    into = c("lineage", NA),
    remove = FALSE
  ) 

geo.data.lin <- read.csv("https://zenodo.org/record/7567336/files/Nawrocka_et_al2018-geo-data.csv", header = TRUE)
sample.lin.aligned$subspecies <- geo.data.lin$subspecies
sample.lin.aligned$country <- geo.data.lin$country
head(sample.lin.aligned, 2)
##       sample lineage        1.X         1.Y        2.X         2.Y        3.X
## 1 A-ada-0698       A -0.2881589 -0.06345784 -0.2516858 -0.06342460 -0.1943791
## 2 A-ada-0700       A -0.2848260 -0.05937522 -0.2504928 -0.06047929 -0.1905145
##          3.Y        4.X         4.Y        5.X        5.Y         6.X
## 1 0.06499985 -0.1814028 -0.01881681 -0.1631677 -0.1450486 -0.06355953
## 2 0.06327772 -0.1835898 -0.01881641 -0.1617977 -0.1466439 -0.07092678
##          6.Y         7.X       7.Y         8.X        8.Y        9.X        9.Y
## 1 0.07545727 0.011953966 0.1247743 -0.01179719 0.09033662 0.05553661 0.04047449
## 2 0.07393102 0.004073201 0.1246405 -0.01836104 0.09146764 0.05396549 0.03975215
##          10.X        10.Y       11.X        11.Y       12.X       12.Y
## 1 0.016877445 0.011009900 0.07311162 -0.03837618 0.08350415 -0.1002200
## 2 0.008618304 0.008977642 0.07310915 -0.04169207 0.08477211 -0.1016867
##        13.X       13.Y      14.X      14.Y      15.X       15.Y      16.X
## 1 0.1101864 -0.1501086 0.1146867 0.1127786 0.1719523 0.05786099 0.2854625
## 2 0.1126206 -0.1516995 0.1105134 0.1122708 0.1694617 0.05566828 0.2880828
##          16.Y      17.X         17.Y      18.X        18.Y       19.X
## 1 0.004669201 0.3480093 0.0003196065 0.3614398 -0.04939595 -0.4785698
## 2 0.003335192 0.3536997 0.0010182266 0.3709922 -0.04559680 -0.4694000
##         19.Y subspecies     country
## 1 0.04616779  adansonii      Guinea
## 2 0.05165077  adansonii Ivory Coast

Canonical variate analysis based on lineages

# Convert 2D array into a 3D array for easier analysis of unknown samples
sample.lin.3D <- arrayspecs(sample.lin.aligned[xy.Names], p, k)
dimnames(sample.lin.3D)[[3]] <- sample.lin.aligned$sample

# use equal prior probability for all groups
n.lin <- length(unique(sample.lin.aligned$lineage)) # number of groups
sample.lin.cva <- CVA(sample.lin.3D, sample.lin.aligned$lineage, rounds = 10000, cv = TRUE, 
                      prior = rep(1/n.lin, n.lin))
## singular Covariance matrix: General inverse is used. Threshold for zero eigenvalue is 1e-10
sample.lin.cva.scores <- as.data.frame(sample.lin.cva$CVscores)
# remove unwanted spaces in variable names otherwise use `CV 1`
colnames(sample.lin.cva.scores) <- gsub(" ", "", colnames(sample.lin.cva.scores)) 

sample.lin.cva.scores <- cbind(sample.lin.aligned$lineage, sample.lin.cva.scores)
names(sample.lin.cva.scores)[1] <- "lineage" # rename column
rownames(sample.lin.cva.scores) <- sample.lin.aligned$sample

ggplot(sample.lin.cva.scores, aes(x = CV1, y = CV2, color = lineage)) +
  geom_point() +
  coord_fixed() +
  stat_ellipse() 

Classification of the samples to lineages

# Convert 2D array into a 3D array
sample.only <- sample.aligned[xy.Names]
unknown.wings <- arrayspecs(sample.only, p, k)
dimnames(unknown.wings)[[3]] <- sample.only$sample

# calculate covarience for each lineage
covariances <- lapply(unique(sample.lin.cva.scores$lineage),
                      function(x)
                        cov(sample.lin.cva.scores[sample.lin.cva.scores$lineage==x,-1]))
means <- aggregate(sample.lin.cva.scores[,names(sample.lin.cva.scores) != "lineage"], 
                  list(sample.lin.cva.scores$lineage), FUN=mean)
rownames(means) <- means$Group.1
means <- means[,names(means) != "Group.1"]

# Projection of the samples to canonical variate space from Nawrocka et al. 2018
groups <- rownames(means)
result.list <- vector(mode = "list", length = nrow(sample.aligned))
CV.list <- vector(mode = "list", length = nrow(sample.aligned))
for (r in 1:nrow(sample.aligned)) { 
  # Align unknown consensus with consensus from reference samples
  unknown.OPA <- procOPA(GPA.lin$consensus, unknown.wings[,,r])
  unknown.aligned <- unknown.OPA$Bhat
  CV.row <- predict(sample.lin.cva, unknown.aligned)

  # create empty list for results
  result <- numeric(length(groups))
  for (i in 1:length(groups)) { 
    MD <- mahalanobis(CV.row, unlist(means[i, ]), as.matrix(covariances[[i]]))
    result[i] <- MD
  }
  result.list[[r]] <- result
  CV.list[[r]] <- CV.row
}
CV.tab <- do.call(rbind, CV.list)
# remove unwanted spaces in variable names otherwise use `CV 1`
colnames(CV.tab) <- gsub(" ", "", colnames(CV.tab)) 
CV.tab <- as.data.frame(CV.tab)
CV.tab <- cbind(CV.tab, sample.aligned$country)
colnames(CV.tab) <- gsub("sample.aligned\\$", "", colnames(CV.tab)) 

# Black ellipses A, C, M and O indicate 95% confidence regions of reference samples from Nawrocka et al. 2018
ggplot(CV.tab, aes(x = CV1, y = CV2, shape = country, color = country))+
  geom_point() +
  scale_shape_manual(name ="Country", values = c(0:2,15:25)) +
  scale_color_manual(name ="Country", values = rainbow(13)) +
  stat_ellipse() +
  stat_ellipse(data = subset(sample.lin.cva.scores, lineage == "A"), 
               mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) + 
  stat_ellipse(data = subset(sample.lin.cva.scores, lineage == "C"), 
               mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) + 
  stat_ellipse(data = subset(sample.lin.cva.scores, lineage == "M"), 
               mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) + 
  stat_ellipse(data = subset(sample.lin.cva.scores, lineage == "O"), 
               mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) + 
  geom_label(means, 
             mapping = aes(x = CV1, y = CV2, label = rownames(means)), inherit.aes = FALSE)

MD.tab <- do.call(rbind, result.list)
MD.tab <- sqrt(MD.tab)
# column names for lineages
colnames(MD.tab) <- groups
MD.tab <- as.data.frame(MD.tab)

lineage <- colnames(MD.tab)[apply(MD.tab, 1, which.min)]
# final column names
colnames(MD.tab) <- paste0("MD.",groups)
MD.tab <- cbind(MD.tab, lineage)
rownames(MD.tab) <- sample.aligned$sample

# frequencies of the lineages
table(MD.tab$lineage)
## 
##   A   C   M   O 
## 179 844 652  50
# frequencies of the lineage in countries
table(sample.aligned$country, MD.tab$lineage)
##     
##        A   C   M   O
##   AT   0  10   0   0
##   ES  52   0 464   0
##   GR  13 190   0  41
##   HR   0 160   0   0
##   HU   0  22   0   0
##   MD   1   8   0   1
##   ME   0  20   0   0
##   PL  97 142   8   6
##   PT  12   0 180   0
##   RO   2 194   0   1
##   RS   0  20   0   0
##   SI   0  21   0   0
##   TR   2  57   0   1
sample.lineages <- MD.tab
sample.lineages <- cbind(sample.lineages, 
                        sample.geo.data$longitude, sample.geo.data$latitude)
colnames(sample.lineages) <- gsub("sample.geo.data\\$", "", colnames(sample.lineages)) 

# Classification of the samples to lineages according to <a href="#Nawrocka et al. 2018">Nawrocka et al. 2018</a>
ggplot(data = world) +
  geom_sf() +
  geom_point(data = sample.lineages, 
             aes(x = jitter.x, y = jitter.y, colour = lineage), size = 1) +
  scale_color_manual(name ="Lineage", values = rainbow(4)) +
  coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) +
  xlab("Longitude") + ylab("Latitude")

# Lineage A
# Mahalanobis Distance to lineage A (with jitter to show multiple samples from one location)
ggplot(data = world) +
  geom_sf() +
  geom_point(data = sample.lineages, 
             aes(x = jitter.x, y = jitter.y, colour = MD.A), size = 1) +
  coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) +
  scale_color_viridis(name = "Distance to\nlineage A", direction = -1) +
  xlab("Longitude") + ylab("Latitude")

# spatial interpolation of the above data using GAM
GAM <- mgcv::gam(MD.A ~ s(longitude, latitude), data = sample.lineages)
viz.GAM <- getViz(GAM)
GAM.A <- plot(viz.GAM, 1) +
  l_fitRaster() +
  l_fitContour(color ="grey30") + 
  l_points() +
  labs(title = NULL) +
  scale_fill_continuous(type = "viridis", direction = -1, name = "Distance to\nlineage A", na.value = "transparent") +
  scale_x_continuous(limits = c(-11, 32), expand = c(0, 0)) +
  scale_y_continuous(limits = c(34, 57), expand = c(0, 0)) +
  geom_path(data = map_data("world"), aes(x = long, y = lat, group = group),
            color ="black", inherit.aes = FALSE)
GAM.A

# Lineage C
# Mahalanobis Distance to lineage C (with jitter to show multiple samples from one location)
ggplot(data = world) +
  geom_sf() +
  geom_point(data = sample.lineages, 
             aes(x = jitter.x, y = jitter.y, colour = MD.C), size = 1) +
  coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) +
  scale_color_viridis(name = "Distance to\nlineage C", direction = -1) +
  xlab("Longitude") + ylab("Latitude")

# spatial interpolation of the above data using GAM
GAM <- mgcv::gam(MD.C ~ s(longitude, latitude), data = sample.lineages)
viz.GAM <- getViz(GAM)
GAM.C <- plot(viz.GAM, 1) +
  l_fitRaster() +
  l_fitContour(color ="grey30") + 
  l_points() +
  labs(title = NULL) +
  scale_fill_continuous(type = "viridis", direction = -1, name = "Distance to\nlineage C", na.value = "transparent") +
  scale_x_continuous(limits = c(-11, 32), expand = c(0, 0)) +
  scale_y_continuous(limits = c(34, 57), expand = c(0, 0)) +
  geom_path(data = map_data("world"), aes(x = long, y = lat, group = group),
            color ="black", inherit.aes = FALSE)
GAM.C

# Lineage M
# Mahalanobis Distance to lineage M (with jitter to show multiple samples from one location)
ggplot(data = world) +
  geom_sf() +
  geom_point(data = sample.lineages, 
             aes(x = jitter.x, y = jitter.y, colour = MD.M), size = 1) +
  coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) +
  scale_color_viridis(name = "Distance to\nlineage M", direction = -1) +
  xlab("Longitude") + ylab("Latitude")

# spatial interpolation of the above data using GAM
GAM <- mgcv::gam(MD.M ~ s(longitude, latitude), data = sample.lineages)
viz.GAM <- getViz(GAM)
GAM.M <- plot(viz.GAM, 1) +
  l_fitRaster() +
  l_fitContour(color ="grey30") + 
  l_points() +
  labs(title = NULL) +
  scale_fill_continuous(type = "viridis", direction = -1, name = "Distance to\nlineage M", na.value = "transparent") +
  scale_x_continuous(limits = c(-11, 32), expand = c(0, 0)) +
  scale_y_continuous(limits = c(34, 57), expand = c(0, 0)) +
  geom_path(data = map_data("world"), aes(x = long, y = lat, group = group),
            color ="black", inherit.aes = FALSE)
GAM.M

# Lineage O
# Mahalanobis Distance to lineage O (with jitter to show multiple samples from one location)
ggplot(data = world) +
  geom_sf() +
  geom_point(data = sample.lineages, 
             aes(x = jitter.x, y = jitter.y, colour = MD.O), size = 1) +
  coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) +
  scale_color_viridis(name = "Distance to\nlineage O", direction = -1) +
  xlab("Longitude") + ylab("Latitude")

# spatial interpolation of the above data using GAM
GAM <- mgcv::gam(MD.O ~ s(longitude, latitude), data = sample.lineages)
viz.GAM <- getViz(GAM)
GAM.O <- plot(viz.GAM, 1) +
  l_fitRaster() +
  l_fitContour(color ="grey30") + 
  l_points() +
  labs(title = NULL) +
  scale_fill_continuous(type = "viridis", direction = -1, name = "Distance to\nlineage O", na.value = "transparent") +
  scale_x_continuous(limits = c(-11, 32), expand = c(0, 0)) +
  scale_y_continuous(limits = c(34, 57), expand = c(0, 0)) +
  geom_path(data = map_data("world"), aes(x = long, y = lat, group = group),
            color ="black", inherit.aes = FALSE)
GAM.O

Identification of unknown samples

As unknown there were used European samples from lineages dataset Nawrocka et al., (2018a).

# use samples from lineages M and C as unknown
sample.unknown <- subset(sample.lin.aligned, lineage == "M" | lineage == "C")

# convert 2D array into a 3D array
unknown.lin <- arrayspecs(sample.unknown[xy.Names], p, k)
dimnames(unknown.lin)[[3]] <- sample.unknown$sample

# calculate CVA scores for unknown samples
CV.list <- vector(mode = "list", length = nrow(sample.unknown))
for (r in 1:nrow(sample.unknown)) { 
  # Align unknown consensus with consensus from reference samples
  unknown.OPA <- procOPA(GPA$consensus, unknown.lin[,,r])
  unknown.aligned <- unknown.OPA$Bhat
  CV.row <- predict(region.cva, unknown.aligned)
  CV.list[[r]] <- CV.row
}
CV.tab <- do.call(rbind, CV.list)
# remove unwanted spaces in variable names otherwise use `CV 1`
colnames(CV.tab) <- gsub(" ", "", colnames(CV.tab)) 
CV.tab <- as.data.frame(CV.tab)

outlier.threshold <- 0.001
typprobs <- typprobClass(CV.tab, region.cva$CVscores, 
                         groups = as.factor(sample.region$region), 
                         outlier = outlier.threshold, sep = FALSE)
print(typprobs)
## 
## 
##  specimens have been classified as:
## 
##  ES-PT    GR HR-SI  none    PL RO-MD    TR 
##     3     3    17    12    11     3     4
# probability of classification for all groups
P.tab <- typprobs$probs
rownames(P.tab) <- sample.unknown$sample
# for each sample find maximum probability
P.max <- apply(P.tab, 1, FUN = max)
# for each sample find region with larges probability
region.max <- colnames(P.tab)[apply(P.tab, 1, which.max)]
# samples with probability below 0.001 are considered as outlines
P.tab.max <- data.frame(country = sample.unknown$country, P.max, region.max)
head(P.tab.max, 2)
##             country       P.max region.max
## C-car-0204  Austria 0.007747604         PL
## C-car-0216 Slovenia 0.016165649      HR-SI
# frequencies of the countries in regions without detection of outliers
table(P.tab.max$country, P.tab.max$region.max)
##           
##            ES-PT GR HR-SI PL RO-MD TR
##   Austria      1  0     3  2     0  0
##   Croatia      0  0     1  0     0  0
##   Denmark      0  0     0  1     0  0
##   England      0  0     0  3     0  0
##   France       0  0     0  3     0  0
##   Greece       0  3     4  0     0  4
##   Hungary      0  0     1  0     1  0
##   Italy        0  0     4  6     1  0
##   Norway       1  0     0  3     0  0
##   Romania      0  0     2  0     0  0
##   Russia       1  0     0  1     0  0
##   Serbia       0  0     2  0     1  0
##   Slovenia     0  0     1  1     0  0
##   Spain        2  0     0  0     0  0
P.tab.max$region.none <- ifelse(P.max < outlier.threshold, "none", region.max)
# frequencies of the countries in regions with detection of outliers
table(P.tab.max$country, P.tab.max$region.none)
##           
##            ES-PT GR HR-SI none PL RO-MD TR
##   Austria      0  0     2    3  1     0  0
##   Croatia      0  0     1    0  0     0  0
##   Denmark      0  0     0    1  0     0  0
##   England      0  0     0    1  2     0  0
##   France       0  0     0    3  0     0  0
##   Greece       0  3     4    0  0     0  4
##   Hungary      0  0     1    0  0     1  0
##   Italy        0  0     4    1  5     1  0
##   Norway       1  0     0    1  2     0  0
##   Romania      0  0     2    0  0     0  0
##   Russia       1  0     0    1  0     0  0
##   Serbia       0  0     2    0  0     1  0
##   Slovenia     0  0     1    0  1     0  0
##   Spain        1  0     0    1  0     0  0
# regions labels
region.cva.scores$region <- sample.region$region
region.cva.means <- aggregate(region.cva.scores[,names(region.cva.scores) != "region"], 
                  list(region.cva.scores$region), FUN = mean)
rownames(region.cva.means) <- region.cva.means$Group.1
region.cva.means <- region.cva.means[,names(region.cva.means) != "Group.1"]

ggplot(CV.tab, aes(x = CV1, y = CV2, shape = sample.unknown$subspecies, color = sample.unknown$subspecies)) +
  geom_point() +
  scale_shape_manual(name ="subspecies", values = c(0:5)) +
  scale_color_manual(name ="subspecies", values = rainbow(6)) +
  stat_ellipse() +
  stat_ellipse(data = subset(region.cva.scores, region == "ES-PT"), 
               mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) + 
  stat_ellipse(data = subset(region.cva.scores, region == "GR"), 
               mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) + 
  stat_ellipse(data = subset(region.cva.scores, region == "HR-SI"), 
               mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) + 
  stat_ellipse(data = subset(region.cva.scores, region == "PL"), 
               mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) + 
  stat_ellipse(data = subset(region.cva.scores, region == "RO-MD"), 
               mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) + 
  stat_ellipse(data = subset(region.cva.scores, region == "TR"), 
               mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) + 
  geom_label(region.cva.means, 
             mapping = aes(x = CV1, y = CV2, label = rownames(region.cva.means)), inherit.aes = FALSE)

References

Nawrocka, A., Kandemir, İ., Fuchs, S., & Tofilski, A. (2018). Computer software for identification of honey bee subspecies and evolutionary lineages. Apidologie, 49(2), 172-184. https://doi.org/10.1007/s13592-017-0538-y

Nawrocka, A., Kandemir, İ., Fuchs, S., & Tofiilski, A. (2018). Dataset: Computer software for identification of honey bee subspecies and evolutionary lineages. Apidologie, 49, 172–184. https://doi.org/10.5281/zenodo.7567336

Oleksa, A., Căuia, E., Siceanu, A., Puškadija, Z., Kovačić, M., Pinto, M.A., Rodrigues, P.J., Hatjina, F., Charistos, L., Bouga, M., Prešern, J., Kandemir, I., Rašić, S., Kusza, S., & Tofilski, A. (2022). Collection of wing images for conservation of honey bees (Apis mellifera) biodiversity in Europe [Data set]. Zenodo. https://doi.org/10.5281/zenodo.7244070

Information about session

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19043)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=Polish_Poland.1250  LC_CTYPE=Polish_Poland.1250   
## [3] LC_MONETARY=Polish_Poland.1250 LC_NUMERIC=C                  
## [5] LC_TIME=Polish_Poland.1250    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] rnaturalearth_0.1.0 viridis_0.6.2       viridisLite_0.4.0  
##  [4] mgcViz_0.1.9        qgam_1.3.4          mgcv_1.8-33        
##  [7] nlme_3.1-149        ggforce_0.3.3       ggplot2_3.3.6      
## [10] geosphere_1.5-14    phangorn_2.5.5      ape_5.4-1          
## [13] Morpho_2.9          shapes_1.2.6        geomorph_4.0.4     
## [16] Matrix_1.4-1        rgl_0.109.6         RRPP_1.3.0         
## 
## loaded via a namespace (and not attached):
##   [1] matrixStats_0.62.0      sf_0.9-6                doParallel_1.0.17      
##   [4] RColorBrewer_1.1-3      tools_4.0.3             bslib_0.4.1            
##   [7] vegan_2.6-2             utf8_1.2.2              R6_2.5.1               
##  [10] KernSmooth_2.23-17      DBI_1.1.3               colorspace_2.0-3       
##  [13] permute_0.9-7           withr_2.5.0             sp_1.5-0               
##  [16] rnaturalearthdata_0.1.0 tidyselect_1.2.0        gridExtra_2.3          
##  [19] GGally_2.1.2            compiler_4.0.3          cli_3.3.0              
##  [22] bezier_1.1.2            isoband_0.2.5           labeling_0.4.2         
##  [25] sass_0.4.1              scales_1.2.1            classInt_0.4-7         
##  [28] quadprog_1.5-8          proxy_0.4-27            stringr_1.4.0          
##  [31] digest_0.6.29           minqa_1.2.4             rmarkdown_2.18         
##  [34] colorRamps_2.3.1        base64enc_0.1-3         jpeg_0.1-9             
##  [37] pkgconfig_2.0.3         htmltools_0.5.3         lme4_1.1-30            
##  [40] maps_3.4.0              highr_0.9               fastmap_1.1.0          
##  [43] htmlwidgets_1.5.4       rlang_1.0.4             rstudioapi_0.14        
##  [46] Rvcg_0.21               shiny_1.7.3             jquerylib_0.1.4        
##  [49] farver_2.1.1            generics_0.1.3          jsonlite_1.8.0         
##  [52] dplyr_1.0.9             magrittr_2.0.1          Rcpp_1.0.9             
##  [55] munsell_0.5.0           fansi_1.0.3             lifecycle_1.0.1        
##  [58] scatterplot3d_0.3-42    stringi_1.7.8           yaml_2.3.5             
##  [61] MASS_7.3-58.1           plyr_1.8.7              grid_4.0.3             
##  [64] parallel_4.0.3          promises_1.2.0.1        miniUI_0.1.1.1         
##  [67] lattice_0.20-41         splines_4.0.3           knitr_1.40             
##  [70] pillar_1.8.1            igraph_1.2.6            boot_1.3-25            
##  [73] codetools_0.2-16        fastmatch_1.1-0         glue_1.6.2             
##  [76] evaluate_0.18           vctrs_0.4.1             nloptr_2.0.3           
##  [79] tweenr_1.0.2            httpuv_1.6.5            foreach_1.5.2          
##  [82] purrr_0.3.4             tidyr_1.2.0             gtable_0.3.1           
##  [85] polyclip_1.10-0         reshape_0.8.9           assertthat_0.2.1       
##  [88] cachem_1.0.6            xfun_0.31               mime_0.12              
##  [91] xtable_1.8-4            e1071_1.7-11            later_1.3.0            
##  [94] class_7.3-17            minpack.lm_1.2-2        tibble_3.1.8           
##  [97] iterators_1.0.14        cluster_2.1.0           units_0.8-0            
## [100] gamm4_0.2-6             ellipsis_0.3.2