---
title: "Honey bee (*Apis mellifera*) wing images: a tool for identification and conservation"
author: "Andrzej Oleksa, Eliza Căuia, Adrian Siceanu, Zlatko Puškadija, Marin Kovačić, M. Alice Pinto, Pedro João Rodrigues, Fani Hatjina, Leonidas Charistos, Maria Bouga, Janez Prešern, İrfan Kandemir, Slađan Rašić, Szilvia Kusza, Adam Tofilski"
date: "`r Sys.Date()`"
output: html_document
---
```{r setup, include = FALSE}
knitr::opts_chunk$set(
echo = TRUE,
message = FALSE,
warning = FALSE
)
```
## Libraries
```{r libraries}
# calculations
library(geomorph) # GPA
library(shapes) # OPA
library(Morpho) # CVA
library(phangorn) # UPGMA
library(geosphere) # distm
# plotting and visualization
library(ggplot2) # plots
ggplot2::theme_set(theme_light())
library(ggforce) # geom_ellipse
library(mgcViz) # GAM visualization
library(viridis) # plot scale
library(rnaturalearth) # maps
```
## Abbreviations
AT - Austria
ES - Spain
GR - Greece
HR - Croatia
HU - Hungary
MD - Moldova
ME - Montenegro
PL - Poland
PT - Portugal
RO - Romania
RS - Serbia
SI - Slovenia
TR - Turkey
## Landmark coordinates
Read raw coordinates of 19 landmarks from Zenodo (Oleksa et al., 2022)
```{r read-coordinates}
wings <- read.csv("https://zenodo.org/record/7244070/files/EU-raw-coordinates.csv", header = TRUE)
# extract sample classifier
wings <- tidyr::separate(
data = wings,
col = file,
sep = c(7), # sample name is in the first 7 characters of file name
into = c("sample", NA),
remove = FALSE
)
# extract country classifier
wings <- tidyr::separate(
data = wings,
col = sample,
sep = c(2), # country code is in the first 2 characters of sample name
into = c("country", NA),
remove = FALSE
)
head(wings, 2)
```
## Geographic data
Read latitude and longitude.
```{r read-geo-data}
geo.data <- read.csv("https://zenodo.org/record/7244070/files/EU-geo-data.csv", header = TRUE)
sample.geo.data <- aggregate(cbind(geo.data$latitude, geo.data$longitude), by = list(geo.data$sample), FUN = mean)
sample.geo.data <- reshape::rename(sample.geo.data, c(Group.1 = "sample", V1 = "latitude", V2 = "longitude"))
# extract country classifier
sample.geo.data <- tidyr::separate(
data = sample.geo.data,
col = sample,
sep = c(2), # country code is in the first 2 characters of sample name
into = c("country", NA),
remove = FALSE
)
head(sample.geo.data, 2)
```
## Map of sampling locations
```{r map}
world <- ne_countries(scale = "medium", returnclass = "sf")
# jitter the coordinates to show all locations
jitter.x <- jitter(sample.geo.data$longitude, amount = 0.2)
jitter.y <- jitter(sample.geo.data$latitude, amount = 0.2)
ggplot(data = world) +
geom_sf() +
geom_point(data = sample.geo.data,
aes(x = jitter.x, y = jitter.y, colour = country, shape = country), size = 1) +
scale_color_manual(name ="Country", values = rainbow(13)) +
scale_shape_manual(name ="Country", values = c(0:2,15:25)) +
coord_sf(xlim = c(-11, 33), ylim = c(34, 56)) +
xlab("Longitude") + ylab("Latitude")
```
## Variable names
In order to avoid mistakes it is safer to use columns names instead of
indexes
```{r var-names}
p <- 19 # number of landmarks
k <- 2 # number of dimensions, in this case 2 for coordinates (x, y)
# create coordinates names used by IdentiFly
xyNames <- c("x1", "y1")
for (i in 2:p) {
xyNames <- c(xyNames, paste0("x", i))
xyNames <- c(xyNames, paste0("y", i))
}
xyNames
# create coordinates names used by geomorph
xy.Names <- c("1.X", "1.Y")
for (i in 2:p) {
xy.Names <- c(xy.Names, paste(i, "X", sep = "."))
xy.Names <- c(xy.Names, paste(i, "Y", sep = "."))
}
xy.Names
# The number of principal components used is 2*p-4 = 34, which is equal to the degrees of freedom
# create principal components names used by prcomp
pcNames <- paste0("PC", 1:(2*p-4))
pcNames
# create principal components names used by geomorph
compNames <- paste0("Comp", 1:(2*p-4))
compNames
geoNames <- c("latitude", "longitude")
```
## GPA-alignment
Before analysis all landmark configurations have to be superimposed.
```{r GPA}
# Convert 2D array into a 3D array
wings.coords <- arrayspecs(wings[xyNames], p, k)
dimnames(wings.coords)[[3]] <- wings$file
# Align the coordinates using Generalized Procrustes Analysis
GPA <- gpagen(wings.coords, print.progress = FALSE)
# Convert 3D array into a 2D array - opposite to arrayspecs
wings.aligned <- two.d.array(GPA$coords)
head(wings.aligned, 2)
```
## Average aligned coordinates within samples
```{r sample-average}
sample.aligned <- aggregate(wings.aligned, by = list(wings$sample), FUN = mean)
names(sample.aligned)[names(sample.aligned) == "Group.1"] <- "sample" # rename column
# extract country code as classifier
sample.aligned <- tidyr::separate(
data = sample.aligned,
col = sample,
sep = c(2),
into = c("country", NA),
remove = FALSE
)
head(sample.aligned, 2)
```
## Principal component analysis of the samples
```{r PCA}
# Convert 2D array into a 3D array
sample.3D <- arrayspecs(sample.aligned[xy.Names], p, k)
dimnames(sample.3D)[[3]] <- sample.aligned$sample
sample.pca <- gm.prcomp(sample.3D)
sample.pca.scores <- as.data.frame(sample.pca$x[ , compNames])
sample.pca.scores <- cbind(sample.geo.data, sample.pca.scores)
# create plot labels
variance.tab <- summary(sample.pca)$PC.summary
variance <- variance.tab["Proportion of Variance", "Comp1"]
variance <- round(100 * variance, 1)
label.x <- paste0("PC1 (", variance, "%)")
variance <- variance.tab["Proportion of Variance", "Comp2"]
variance <- round(100 * variance, 1)
label.y <- paste0("PC2 (", variance, "%)")
ggplot(sample.pca.scores, aes(x = Comp1, y = Comp2, shape = sample.aligned$country, color = sample.aligned$country)) +
geom_point() +
scale_shape_manual(name ="Country", values = c(0:2,15:25)) +
scale_color_manual(name ="Country", values = rainbow(13)) +
stat_ellipse() +
xlab(label.x) + ylab(label.y)
```
## Differences in wing shape
```{r PCA-wing-shape}
# define which landmarks are connected by lines in wireframe graph
link.x <- c(1, 1, 2, 2, 3, 3, 4, 4, 5, 6, 7, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 14, 15, 16, 17)
link.y <- c(2, 3, 4, 5, 6, 19, 6, 10, 12, 8, 8, 14, 19, 9, 10, 15, 11, 12, 16, 13, 18, 15, 16, 17, 18)
links.apis <- cbind(link.x, link.y)
# mark minimum blue and maximum red
GP1 <- gridPar(pt.bg = "blue", link.col = "blue", pt.size = 1,
tar.pt.bg ="red", tar.link.col ="red")
# wing shape change along PC1
plotRefToTarget(M1 = sample.pca$shapes$shapes.comp1$min,
M2 = sample.pca$shapes$shapes.comp1$max,
gridPars=GP1, method = "points", links = links.apis)
# wing shape change along PC2
plotRefToTarget(M1 = sample.pca$shapes$shapes.comp2$min,
M2 = sample.pca$shapes$shapes.comp2$max,
gridPars=GP1, method = "points", links = links.apis)
# countries difer significantly in wing shape
country.MANOVA <- manova(as.matrix(sample.pca.scores[ , compNames]) ~ country, data=sample.pca.scores)
summary(country.MANOVA)
```
## Visualization of the first two principal components with the generalized additive model
```{r GAM}
# PC1
ggplot(data = world) +
geom_sf() +
geom_point(data = sample.pca.scores,
aes(x = jitter.x, y = jitter.y, colour = Comp1), size = 1) +
coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) +
scale_color_viridis(name = "PC1") +
xlab("Longitude") + ylab("Latitude")
GAM <- mgcv::gam(Comp1 ~ s(longitude, latitude), data = sample.pca.scores)
anova(GAM)
viz.GAM <- getViz(GAM)
plot(viz.GAM, 1) +
l_fitRaster() +
l_fitContour(color ="grey30") +
l_points() +
labs(title = NULL) +
scale_fill_continuous(type = "viridis", name = "PC1", na.value = "transparent") +
scale_x_continuous(limits = c(-11, 32), expand = c(0, 0)) +
scale_y_continuous(limits = c(34, 57), expand = c(0, 0)) +
geom_path(data = map_data("world"), aes(x = long, y = lat, group = group),
color ="black", inherit.aes = FALSE)
# PC2
ggplot(data = world) +
geom_sf() +
geom_point(data = sample.pca.scores,
aes(x = jitter.x, y = jitter.y, colour = Comp2), size = 1) +
coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) +
scale_color_viridis(name = "PC2") +
xlab("Longitude") + ylab("Latitude")
GAM <- mgcv::gam(Comp2 ~ s(longitude, latitude), data = sample.pca.scores)
anova(GAM)
viz.GAM <- getViz(GAM)
plot(viz.GAM, 1) +
l_fitRaster() + scale_fill_continuous(na.value = "transparent") +
l_fitContour(color ="grey30") +
l_points() +
labs(title = NULL) +
scale_fill_continuous(type = "viridis", name = "PC2", na.value = "transparent") +
scale_x_continuous(limits = c(-11, 32), expand = c(0, 0)) +
scale_y_continuous(limits = c(34, 57), expand = c(0, 0)) +
geom_path(data = map_data("world"), aes(x = long, y = lat, group = group),
color ="black", inherit.aes = FALSE)
```
## Canonical variate analysis based on countries
```{r CVA-countries}
# Convert 2D array into a 3D array
sample.3D <- arrayspecs(sample.aligned[xy.Names], p, k)
dimnames(sample.3D)[[3]] <- sample.aligned$sample
# use equal prior probability for all groups
n.country <- length(unique(sample.aligned$country)) # number of groups
sample.cva <- CVA(sample.3D, sample.aligned$country, rounds = 10000, cv = TRUE,
prior = rep(1/n.country, n.country))
sample.cva.scores <- as.data.frame(sample.cva$CVscores)
# remove unwanted spaces in variable names otherwise use `CV 1`
colnames(sample.cva.scores) <- gsub(" ", "", colnames(sample.cva.scores))
sample.cva.scores <- cbind(sample.geo.data, sample.cva.scores)
ggplot(sample.cva.scores, aes(x = CV1, y = CV2, shape = country, color = country)) +
geom_point() +
scale_shape_manual(name ="Country", values = c(0:2,15:25)) +
scale_color_manual(name ="Country", values = rainbow(13)) +
stat_ellipse()
```
## Confusion matrix
Classification of the samples to countries.
```{r class-matrix}
CVA.class <- typprobClass(sample.cva$CVscores, groups = as.factor(sample.pca.scores$country), outlier = 0)
print(CVA.class)
```
## Classification of the samples to countries
The map shows only incorrectly classifies samples. Color and shape of
the points indicate country to which a sample was classified. The
classification with cross-validation
```{r class-map}
sample.class <- data.frame(sample = sample.geo.data$sample,
country = sample.geo.data$country,
latitude = sample.geo.data$latitude,
longitude = sample.geo.data$longitude,
classified.as = CVA.class$groupaffinCV)
# add column indicating incorrect classification
sample.class$error <- ifelse(sample.class$country == sample.class$classified.as, "OK", "error")
# only the incorrectly classified samples
sample.error <- sample.class[sample.class$error == "error",
c("sample", "country", "latitude", "longitude","classified.as")]
# incorrect countries classification map
ggplot(data = world) +
geom_sf() +
geom_point(data = sample.error,
aes(x = longitude, y = latitude,
colour = classified.as,
shape = classified.as), size = 2) +
scale_color_manual(name ="Classified as", values = rainbow(13)) +
scale_shape_manual(name ="Classified as", values = c(0:2, 15:25)) +
coord_sf(xlim = c(-11, 33), ylim = c(34, 56), expand = FALSE) +
labs(x = "Longitude", y = "Latitude")
```
## Differences between countries in wing shape
```{r class-countries}
# Mahalanobis Distnces between countries
dist <- sample.cva$Dist
MD.dist <- as.matrix(dist$GroupdistMaha)
MD.dist
# Significance of differences betwen countries
MD.prob <- as.matrix(dist$probsMaha)
MD.prob
```
## Similarity trees
```{r tree}
# UPGMA tree
tree.upgma <- upgma(MD.dist)
plot(tree.upgma, label.offset = 0.1, cex = 0.5)
```
## Isolation by distance
There is significant correlation between geographical distances and
Mahalanobis distances between countries
```{r isolation}
# calculate geographical distance between countries
country.geo.data <- aggregate(cbind(sample.geo.data$latitude, sample.geo.data$longitude),
by = list(sample.geo.data$country), FUN = mean)
country.geo.data <- reshape::rename(country.geo.data,
c(Group.1 = "country", V1 = "latitude", V2 = "longitude"))
geo.dist <- distm(country.geo.data[c("longitude","latitude")], fun = distGeo)
row.names(geo.dist) <- country.geo.data$country
colnames(geo.dist) <- country.geo.data$country
geo.dist <- geo.dist / 1000 # convert meters to kilometers
geo.dist
vegan::mantel(geo.dist, MD.dist, permutations = 9999, method = "spearman")
# convert distance table to get distance in one column
MD.distance <- as.dist(MD.dist)
MD.distance <- as.numeric(MD.distance)
MD.distance <- data.frame(t(combn(rownames(MD.dist),2)), MD.distance)
geo.distance <- as.dist(geo.dist)
geo.distance <- as.numeric(geo.distance)
geo.distance <- data.frame(t(combn(rownames(geo.dist),2)), geo.distance)
geo.distance$MD.distance <- MD.distance$MD.distance
geo.distance$dist.name <- paste0(geo.distance$X1, "-", geo.distance$X2)
outliers <- subset(geo.distance, dist.name %in% c("AT-HU","AT-HR", "AT-SI"))
ggplot(geo.distance, aes(x = geo.distance, y = MD.distance)) +
geom_point() +
coord_trans(x = "log10") +
geom_text( data = outliers, aes(x = geo.distance , y = MD.distance, label = dist.name), hjust =-0.2) +
xlab("Geographical distance (km)") + ylab("Mahalanobis distance between wing shape")
```
## Canonical variate analysis based on regions
```{r CVA-regions}
# add column with regions consisting of well represented countries or pairs of neighboring countries
sample.aligned$region <- ifelse(sample.aligned$country == "ES"
| sample.class$country == "PT"
, "ES-PT", sample.aligned$country)
sample.aligned$region <- ifelse(sample.aligned$country == "RO"
| sample.class$country == "MD"
, "RO-MD", sample.aligned$region)
sample.aligned$region <- ifelse(sample.aligned$country == "HR"
| sample.class$country == "SI"
, "HR-SI", sample.aligned$region)
# exclude countries with sample size smaller than 25
sample.region <- sample.aligned[sample.aligned$country != "AT"
& sample.aligned$country != "HU"
& sample.aligned$country != "ME"
& sample.aligned$country != "RS", ]
sample.3D <- arrayspecs(sample.region[xy.Names], p, k)
dimnames(sample.3D)[[3]] <- sample.region$sample
# use equal prior probability for all groups
n.region <- length(unique(sample.region$region)) # number of groups
region.cva <- CVA(sample.3D, sample.region$region, rounds = 10000, cv = TRUE,
prior = rep(1/n.region, n.region))
region.cva.scores <- as.data.frame(region.cva$CVscores)
# remove unwanted spaces in variable names otherwise use `CV 1`
colnames(region.cva.scores) <- gsub(" ", "", colnames(region.cva.scores))
ggplot(region.cva.scores, aes(x = CV1, y = CV2, shape = sample.region$region, color = sample.region$region)) +
geom_point() +
scale_shape_manual(name ="Region", values = c(0:2,19,5:6)) +
scale_color_manual(name ="Region", values = rainbow(6)) +
stat_ellipse()
CVA.class.region <- typprobClass(region.cva$CVscores, groups = as.factor(sample.region$region), outlier = 0)
print(CVA.class.region)
```
# Comparison with lineages dataset Nawrocka et al., (2018a) and Nawrocka et al., (2018b)
## Read landmark coordinates from lineages dataset
```{r read-coordinates-lineage}
wings.lin <- read.csv("https://zenodo.org/record/7567336/files/Nawrocka_et_al2018.csv", header = TRUE)
# extract sample classifier
wings.lin <- tidyr::separate(
data = wings.lin,
col = file,
sep = c(10), # sample name is in the first 10 characters of file name
into = c("sample", NA),
remove = FALSE
)
# extract lineage classifier
wings.lin <- tidyr::separate(
data = wings.lin,
col = file,
sep = c(1), # lineage code is in the first character of file name
into = c("lineage", NA),
remove = FALSE
)
head(wings.lin, 2)
```
## GPA-alignment of lineages dataset
Before analysis all landmark configurations have to be superimposed.
```{r GPA-lineage}
# Convert 2D array into a 3D array
wings.lin.3D <- arrayspecs(wings.lin[xyNames], p, k)
dimnames(wings.lin.3D)[[3]] <- wings.lin$file
# Align the coordinates using Generalized Procrustes Analysis
GPA.lin <- gpagen(wings.lin.3D, print.progress = FALSE)
# Convert 3D array into a 2D array - opposite to arrayspecs
wings.lin.aligned <- two.d.array(GPA.lin$coords)
head(wings.lin.aligned, 2)
```
## Average aligned coordinates within samples for lineages dataset
```{r sample-average-lineage}
sample.lin.aligned <- aggregate(wings.lin.aligned, by = list(wings.lin$sample), FUN = mean)
names(sample.lin.aligned)[names(sample.lin.aligned) == "Group.1"] <- "sample" # rename column
# extract lineage code as classifier
sample.lin.aligned <- tidyr::separate(
data = sample.lin.aligned,
col = sample,
sep = c(1),
into = c("lineage", NA),
remove = FALSE
)
geo.data.lin <- read.csv("https://zenodo.org/record/7567336/files/Nawrocka_et_al2018-geo-data.csv", header = TRUE)
sample.lin.aligned$subspecies <- geo.data.lin$subspecies
sample.lin.aligned$country <- geo.data.lin$country
head(sample.lin.aligned, 2)
```
## Canonical variate analysis based on lineages
```{r CVA-lineages}
# Convert 2D array into a 3D array for easier analysis of unknown samples
sample.lin.3D <- arrayspecs(sample.lin.aligned[xy.Names], p, k)
dimnames(sample.lin.3D)[[3]] <- sample.lin.aligned$sample
# use equal prior probability for all groups
n.lin <- length(unique(sample.lin.aligned$lineage)) # number of groups
sample.lin.cva <- CVA(sample.lin.3D, sample.lin.aligned$lineage, rounds = 10000, cv = TRUE,
prior = rep(1/n.lin, n.lin))
sample.lin.cva.scores <- as.data.frame(sample.lin.cva$CVscores)
# remove unwanted spaces in variable names otherwise use `CV 1`
colnames(sample.lin.cva.scores) <- gsub(" ", "", colnames(sample.lin.cva.scores))
sample.lin.cva.scores <- cbind(sample.lin.aligned$lineage, sample.lin.cva.scores)
names(sample.lin.cva.scores)[1] <- "lineage" # rename column
rownames(sample.lin.cva.scores) <- sample.lin.aligned$sample
ggplot(sample.lin.cva.scores, aes(x = CV1, y = CV2, color = lineage)) +
geom_point() +
coord_fixed() +
stat_ellipse()
```
## Classification of the samples to lineages
```{r class-lineages}
# Convert 2D array into a 3D array
sample.only <- sample.aligned[xy.Names]
unknown.wings <- arrayspecs(sample.only, p, k)
dimnames(unknown.wings)[[3]] <- sample.only$sample
# calculate covarience for each lineage
covariances <- lapply(unique(sample.lin.cva.scores$lineage),
function(x)
cov(sample.lin.cva.scores[sample.lin.cva.scores$lineage==x,-1]))
means <- aggregate(sample.lin.cva.scores[,names(sample.lin.cva.scores) != "lineage"],
list(sample.lin.cva.scores$lineage), FUN=mean)
rownames(means) <- means$Group.1
means <- means[,names(means) != "Group.1"]
# Projection of the samples to canonical variate space from Nawrocka et al. 2018
groups <- rownames(means)
result.list <- vector(mode = "list", length = nrow(sample.aligned))
CV.list <- vector(mode = "list", length = nrow(sample.aligned))
for (r in 1:nrow(sample.aligned)) {
# Align unknown consensus with consensus from reference samples
unknown.OPA <- procOPA(GPA.lin$consensus, unknown.wings[,,r])
unknown.aligned <- unknown.OPA$Bhat
CV.row <- predict(sample.lin.cva, unknown.aligned)
# create empty list for results
result <- numeric(length(groups))
for (i in 1:length(groups)) {
MD <- mahalanobis(CV.row, unlist(means[i, ]), as.matrix(covariances[[i]]))
result[i] <- MD
}
result.list[[r]] <- result
CV.list[[r]] <- CV.row
}
CV.tab <- do.call(rbind, CV.list)
# remove unwanted spaces in variable names otherwise use `CV 1`
colnames(CV.tab) <- gsub(" ", "", colnames(CV.tab))
CV.tab <- as.data.frame(CV.tab)
CV.tab <- cbind(CV.tab, sample.aligned$country)
colnames(CV.tab) <- gsub("sample.aligned\\$", "", colnames(CV.tab))
# Black ellipses A, C, M and O indicate 95% confidence regions of reference samples from Nawrocka et al. 2018
ggplot(CV.tab, aes(x = CV1, y = CV2, shape = country, color = country))+
geom_point() +
scale_shape_manual(name ="Country", values = c(0:2,15:25)) +
scale_color_manual(name ="Country", values = rainbow(13)) +
stat_ellipse() +
stat_ellipse(data = subset(sample.lin.cva.scores, lineage == "A"),
mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) +
stat_ellipse(data = subset(sample.lin.cva.scores, lineage == "C"),
mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) +
stat_ellipse(data = subset(sample.lin.cva.scores, lineage == "M"),
mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) +
stat_ellipse(data = subset(sample.lin.cva.scores, lineage == "O"),
mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) +
geom_label(means,
mapping = aes(x = CV1, y = CV2, label = rownames(means)), inherit.aes = FALSE)
MD.tab <- do.call(rbind, result.list)
MD.tab <- sqrt(MD.tab)
# column names for lineages
colnames(MD.tab) <- groups
MD.tab <- as.data.frame(MD.tab)
lineage <- colnames(MD.tab)[apply(MD.tab, 1, which.min)]
# final column names
colnames(MD.tab) <- paste0("MD.",groups)
MD.tab <- cbind(MD.tab, lineage)
rownames(MD.tab) <- sample.aligned$sample
# frequencies of the lineages
table(MD.tab$lineage)
# frequencies of the lineage in countries
table(sample.aligned$country, MD.tab$lineage)
sample.lineages <- MD.tab
sample.lineages <- cbind(sample.lineages,
sample.geo.data$longitude, sample.geo.data$latitude)
colnames(sample.lineages) <- gsub("sample.geo.data\\$", "", colnames(sample.lineages))
# Classification of the samples to lineages according to Nawrocka et al. 2018
ggplot(data = world) +
geom_sf() +
geom_point(data = sample.lineages,
aes(x = jitter.x, y = jitter.y, colour = lineage), size = 1) +
scale_color_manual(name ="Lineage", values = rainbow(4)) +
coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) +
xlab("Longitude") + ylab("Latitude")
# Lineage A
# Mahalanobis Distance to lineage A (with jitter to show multiple samples from one location)
ggplot(data = world) +
geom_sf() +
geom_point(data = sample.lineages,
aes(x = jitter.x, y = jitter.y, colour = MD.A), size = 1) +
coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) +
scale_color_viridis(name = "Distance to\nlineage A", direction = -1) +
xlab("Longitude") + ylab("Latitude")
# spatial interpolation of the above data using GAM
GAM <- mgcv::gam(MD.A ~ s(longitude, latitude), data = sample.lineages)
viz.GAM <- getViz(GAM)
GAM.A <- plot(viz.GAM, 1) +
l_fitRaster() +
l_fitContour(color ="grey30") +
l_points() +
labs(title = NULL) +
scale_fill_continuous(type = "viridis", direction = -1, name = "Distance to\nlineage A", na.value = "transparent") +
scale_x_continuous(limits = c(-11, 32), expand = c(0, 0)) +
scale_y_continuous(limits = c(34, 57), expand = c(0, 0)) +
geom_path(data = map_data("world"), aes(x = long, y = lat, group = group),
color ="black", inherit.aes = FALSE)
GAM.A
# Lineage C
# Mahalanobis Distance to lineage C (with jitter to show multiple samples from one location)
ggplot(data = world) +
geom_sf() +
geom_point(data = sample.lineages,
aes(x = jitter.x, y = jitter.y, colour = MD.C), size = 1) +
coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) +
scale_color_viridis(name = "Distance to\nlineage C", direction = -1) +
xlab("Longitude") + ylab("Latitude")
# spatial interpolation of the above data using GAM
GAM <- mgcv::gam(MD.C ~ s(longitude, latitude), data = sample.lineages)
viz.GAM <- getViz(GAM)
GAM.C <- plot(viz.GAM, 1) +
l_fitRaster() +
l_fitContour(color ="grey30") +
l_points() +
labs(title = NULL) +
scale_fill_continuous(type = "viridis", direction = -1, name = "Distance to\nlineage C", na.value = "transparent") +
scale_x_continuous(limits = c(-11, 32), expand = c(0, 0)) +
scale_y_continuous(limits = c(34, 57), expand = c(0, 0)) +
geom_path(data = map_data("world"), aes(x = long, y = lat, group = group),
color ="black", inherit.aes = FALSE)
GAM.C
# Lineage M
# Mahalanobis Distance to lineage M (with jitter to show multiple samples from one location)
ggplot(data = world) +
geom_sf() +
geom_point(data = sample.lineages,
aes(x = jitter.x, y = jitter.y, colour = MD.M), size = 1) +
coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) +
scale_color_viridis(name = "Distance to\nlineage M", direction = -1) +
xlab("Longitude") + ylab("Latitude")
# spatial interpolation of the above data using GAM
GAM <- mgcv::gam(MD.M ~ s(longitude, latitude), data = sample.lineages)
viz.GAM <- getViz(GAM)
GAM.M <- plot(viz.GAM, 1) +
l_fitRaster() +
l_fitContour(color ="grey30") +
l_points() +
labs(title = NULL) +
scale_fill_continuous(type = "viridis", direction = -1, name = "Distance to\nlineage M", na.value = "transparent") +
scale_x_continuous(limits = c(-11, 32), expand = c(0, 0)) +
scale_y_continuous(limits = c(34, 57), expand = c(0, 0)) +
geom_path(data = map_data("world"), aes(x = long, y = lat, group = group),
color ="black", inherit.aes = FALSE)
GAM.M
# Lineage O
# Mahalanobis Distance to lineage O (with jitter to show multiple samples from one location)
ggplot(data = world) +
geom_sf() +
geom_point(data = sample.lineages,
aes(x = jitter.x, y = jitter.y, colour = MD.O), size = 1) +
coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) +
scale_color_viridis(name = "Distance to\nlineage O", direction = -1) +
xlab("Longitude") + ylab("Latitude")
# spatial interpolation of the above data using GAM
GAM <- mgcv::gam(MD.O ~ s(longitude, latitude), data = sample.lineages)
viz.GAM <- getViz(GAM)
GAM.O <- plot(viz.GAM, 1) +
l_fitRaster() +
l_fitContour(color ="grey30") +
l_points() +
labs(title = NULL) +
scale_fill_continuous(type = "viridis", direction = -1, name = "Distance to\nlineage O", na.value = "transparent") +
scale_x_continuous(limits = c(-11, 32), expand = c(0, 0)) +
scale_y_continuous(limits = c(34, 57), expand = c(0, 0)) +
geom_path(data = map_data("world"), aes(x = long, y = lat, group = group),
color ="black", inherit.aes = FALSE)
GAM.O
```
## Identification of unknown samples
As unknown there were used European samples from lineages dataset Nawrocka et al., (2018a).
```{r class-unknown}
# use samples from lineages M and C as unknown
sample.unknown <- subset(sample.lin.aligned, lineage == "M" | lineage == "C")
# convert 2D array into a 3D array
unknown.lin <- arrayspecs(sample.unknown[xy.Names], p, k)
dimnames(unknown.lin)[[3]] <- sample.unknown$sample
# calculate CVA scores for unknown samples
CV.list <- vector(mode = "list", length = nrow(sample.unknown))
for (r in 1:nrow(sample.unknown)) {
# Align unknown consensus with consensus from reference samples
unknown.OPA <- procOPA(GPA$consensus, unknown.lin[,,r])
unknown.aligned <- unknown.OPA$Bhat
CV.row <- predict(region.cva, unknown.aligned)
CV.list[[r]] <- CV.row
}
CV.tab <- do.call(rbind, CV.list)
# remove unwanted spaces in variable names otherwise use `CV 1`
colnames(CV.tab) <- gsub(" ", "", colnames(CV.tab))
CV.tab <- as.data.frame(CV.tab)
outlier.threshold <- 0.001
typprobs <- typprobClass(CV.tab, region.cva$CVscores,
groups = as.factor(sample.region$region),
outlier = outlier.threshold, sep = FALSE)
print(typprobs)
# probability of classification for all groups
P.tab <- typprobs$probs
rownames(P.tab) <- sample.unknown$sample
# for each sample find maximum probability
P.max <- apply(P.tab, 1, FUN = max)
# for each sample find region with larges probability
region.max <- colnames(P.tab)[apply(P.tab, 1, which.max)]
# samples with probability below 0.001 are considered as outlines
P.tab.max <- data.frame(country = sample.unknown$country, P.max, region.max)
head(P.tab.max, 2)
# frequencies of the countries in regions without detection of outliers
table(P.tab.max$country, P.tab.max$region.max)
P.tab.max$region.none <- ifelse(P.max < outlier.threshold, "none", region.max)
# frequencies of the countries in regions with detection of outliers
table(P.tab.max$country, P.tab.max$region.none)
# regions labels
region.cva.scores$region <- sample.region$region
region.cva.means <- aggregate(region.cva.scores[,names(region.cva.scores) != "region"],
list(region.cva.scores$region), FUN = mean)
rownames(region.cva.means) <- region.cva.means$Group.1
region.cva.means <- region.cva.means[,names(region.cva.means) != "Group.1"]
ggplot(CV.tab, aes(x = CV1, y = CV2, shape = sample.unknown$subspecies, color = sample.unknown$subspecies)) +
geom_point() +
scale_shape_manual(name ="subspecies", values = c(0:5)) +
scale_color_manual(name ="subspecies", values = rainbow(6)) +
stat_ellipse() +
stat_ellipse(data = subset(region.cva.scores, region == "ES-PT"),
mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) +
stat_ellipse(data = subset(region.cva.scores, region == "GR"),
mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) +
stat_ellipse(data = subset(region.cva.scores, region == "HR-SI"),
mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) +
stat_ellipse(data = subset(region.cva.scores, region == "PL"),
mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) +
stat_ellipse(data = subset(region.cva.scores, region == "RO-MD"),
mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) +
stat_ellipse(data = subset(region.cva.scores, region == "TR"),
mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) +
geom_label(region.cva.means,
mapping = aes(x = CV1, y = CV2, label = rownames(region.cva.means)), inherit.aes = FALSE)
```
## References
Nawrocka, A., Kandemir, İ., Fuchs, S., & Tofilski, A. (2018). Computer software for identification of honey bee subspecies and evolutionary lineages. Apidologie, 49(2), 172-184. https://doi.org/10.1007/s13592-017-0538-y
Nawrocka, A., Kandemir, İ., Fuchs, S., & Tofiilski, A. (2018). Dataset: Computer software for identification of honey bee subspecies and evolutionary lineages. Apidologie, 49, 172–184. https://doi.org/10.5281/zenodo.7567336
Oleksa, A., Căuia, E., Siceanu, A., Puškadija, Z., Kovačić, M., Pinto, M.A., Rodrigues, P.J., Hatjina, F., Charistos, L., Bouga, M., Prešern, J., Kandemir, I., Rašić, S., Kusza, S., & Tofilski, A. (2022). Collection of wing images for conservation of honey bees (*Apis mellifera*) biodiversity in Europe [Data set]. Zenodo. https://doi.org/10.5281/zenodo.7244070
## Information about session ```{r info} sessionInfo() ```