--- title: "Honey bee (*Apis mellifera*) wing images: a tool for identification and conservation" author: "Andrzej Oleksa, Eliza Căuia, Adrian Siceanu, Zlatko Puškadija, Marin Kovačić, M. Alice Pinto, Pedro João Rodrigues, Fani Hatjina, Leonidas Charistos, Maria Bouga, Janez Prešern, İrfan Kandemir, Slađan Rašić, Szilvia Kusza, Adam Tofilski" date: "`r Sys.Date()`" output: html_document --- ```{r setup, include = FALSE} knitr::opts_chunk$set( echo = TRUE, message = FALSE, warning = FALSE ) ``` ## Libraries ```{r libraries} # calculations library(geomorph) # GPA library(shapes) # OPA library(Morpho) # CVA library(phangorn) # UPGMA library(geosphere) # distm # plotting and visualization library(ggplot2) # plots ggplot2::theme_set(theme_light()) library(ggforce) # geom_ellipse library(mgcViz) # GAM visualization library(viridis) # plot scale library(rnaturalearth) # maps ``` ## Abbreviations AT - Austria
ES - Spain
GR - Greece
HR - Croatia
HU - Hungary
MD - Moldova
ME - Montenegro
PL - Poland
PT - Portugal
RO - Romania
RS - Serbia
SI - Slovenia
TR - Turkey
## Landmark coordinates Read raw coordinates of 19 landmarks from Zenodo (Oleksa et al., 2022) ```{r read-coordinates} wings <- read.csv("https://zenodo.org/record/7244070/files/EU-raw-coordinates.csv", header = TRUE) # extract sample classifier wings <- tidyr::separate( data = wings, col = file, sep = c(7), # sample name is in the first 7 characters of file name into = c("sample", NA), remove = FALSE ) # extract country classifier wings <- tidyr::separate( data = wings, col = sample, sep = c(2), # country code is in the first 2 characters of sample name into = c("country", NA), remove = FALSE ) head(wings, 2) ``` ## Geographic data Read latitude and longitude. ```{r read-geo-data} geo.data <- read.csv("https://zenodo.org/record/7244070/files/EU-geo-data.csv", header = TRUE) sample.geo.data <- aggregate(cbind(geo.data$latitude, geo.data$longitude), by = list(geo.data$sample), FUN = mean) sample.geo.data <- reshape::rename(sample.geo.data, c(Group.1 = "sample", V1 = "latitude", V2 = "longitude")) # extract country classifier sample.geo.data <- tidyr::separate( data = sample.geo.data, col = sample, sep = c(2), # country code is in the first 2 characters of sample name into = c("country", NA), remove = FALSE ) head(sample.geo.data, 2) ``` ## Map of sampling locations ```{r map} world <- ne_countries(scale = "medium", returnclass = "sf") # jitter the coordinates to show all locations jitter.x <- jitter(sample.geo.data$longitude, amount = 0.2) jitter.y <- jitter(sample.geo.data$latitude, amount = 0.2) ggplot(data = world) + geom_sf() + geom_point(data = sample.geo.data, aes(x = jitter.x, y = jitter.y, colour = country, shape = country), size = 1) + scale_color_manual(name ="Country", values = rainbow(13)) + scale_shape_manual(name ="Country", values = c(0:2,15:25)) + coord_sf(xlim = c(-11, 33), ylim = c(34, 56)) + xlab("Longitude") + ylab("Latitude") ``` ## Variable names In order to avoid mistakes it is safer to use columns names instead of indexes ```{r var-names} p <- 19 # number of landmarks k <- 2 # number of dimensions, in this case 2 for coordinates (x, y) # create coordinates names used by IdentiFly xyNames <- c("x1", "y1") for (i in 2:p) { xyNames <- c(xyNames, paste0("x", i)) xyNames <- c(xyNames, paste0("y", i)) } xyNames # create coordinates names used by geomorph xy.Names <- c("1.X", "1.Y") for (i in 2:p) { xy.Names <- c(xy.Names, paste(i, "X", sep = ".")) xy.Names <- c(xy.Names, paste(i, "Y", sep = ".")) } xy.Names # The number of principal components used is 2*p-4 = 34, which is equal to the degrees of freedom # create principal components names used by prcomp pcNames <- paste0("PC", 1:(2*p-4)) pcNames # create principal components names used by geomorph compNames <- paste0("Comp", 1:(2*p-4)) compNames geoNames <- c("latitude", "longitude") ``` ## GPA-alignment Before analysis all landmark configurations have to be superimposed. ```{r GPA} # Convert 2D array into a 3D array wings.coords <- arrayspecs(wings[xyNames], p, k) dimnames(wings.coords)[[3]] <- wings$file # Align the coordinates using Generalized Procrustes Analysis GPA <- gpagen(wings.coords, print.progress = FALSE) # Convert 3D array into a 2D array - opposite to arrayspecs wings.aligned <- two.d.array(GPA$coords) head(wings.aligned, 2) ``` ## Average aligned coordinates within samples ```{r sample-average} sample.aligned <- aggregate(wings.aligned, by = list(wings$sample), FUN = mean) names(sample.aligned)[names(sample.aligned) == "Group.1"] <- "sample" # rename column # extract country code as classifier sample.aligned <- tidyr::separate( data = sample.aligned, col = sample, sep = c(2), into = c("country", NA), remove = FALSE ) head(sample.aligned, 2) ``` ## Principal component analysis of the samples ```{r PCA} # Convert 2D array into a 3D array sample.3D <- arrayspecs(sample.aligned[xy.Names], p, k) dimnames(sample.3D)[[3]] <- sample.aligned$sample sample.pca <- gm.prcomp(sample.3D) sample.pca.scores <- as.data.frame(sample.pca$x[ , compNames]) sample.pca.scores <- cbind(sample.geo.data, sample.pca.scores) # create plot labels variance.tab <- summary(sample.pca)$PC.summary variance <- variance.tab["Proportion of Variance", "Comp1"] variance <- round(100 * variance, 1) label.x <- paste0("PC1 (", variance, "%)") variance <- variance.tab["Proportion of Variance", "Comp2"] variance <- round(100 * variance, 1) label.y <- paste0("PC2 (", variance, "%)") ggplot(sample.pca.scores, aes(x = Comp1, y = Comp2, shape = sample.aligned$country, color = sample.aligned$country)) + geom_point() + scale_shape_manual(name ="Country", values = c(0:2,15:25)) + scale_color_manual(name ="Country", values = rainbow(13)) + stat_ellipse() + xlab(label.x) + ylab(label.y) ``` ## Differences in wing shape ```{r PCA-wing-shape} # define which landmarks are connected by lines in wireframe graph link.x <- c(1, 1, 2, 2, 3, 3, 4, 4, 5, 6, 7, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 14, 15, 16, 17) link.y <- c(2, 3, 4, 5, 6, 19, 6, 10, 12, 8, 8, 14, 19, 9, 10, 15, 11, 12, 16, 13, 18, 15, 16, 17, 18) links.apis <- cbind(link.x, link.y) # mark minimum blue and maximum red GP1 <- gridPar(pt.bg = "blue", link.col = "blue", pt.size = 1, tar.pt.bg ="red", tar.link.col ="red") # wing shape change along PC1 plotRefToTarget(M1 = sample.pca$shapes$shapes.comp1$min, M2 = sample.pca$shapes$shapes.comp1$max, gridPars=GP1, method = "points", links = links.apis) # wing shape change along PC2 plotRefToTarget(M1 = sample.pca$shapes$shapes.comp2$min, M2 = sample.pca$shapes$shapes.comp2$max, gridPars=GP1, method = "points", links = links.apis) # countries difer significantly in wing shape country.MANOVA <- manova(as.matrix(sample.pca.scores[ , compNames]) ~ country, data=sample.pca.scores) summary(country.MANOVA) ``` ## Visualization of the first two principal components with the generalized additive model ```{r GAM} # PC1 ggplot(data = world) + geom_sf() + geom_point(data = sample.pca.scores, aes(x = jitter.x, y = jitter.y, colour = Comp1), size = 1) + coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) + scale_color_viridis(name = "PC1") + xlab("Longitude") + ylab("Latitude") GAM <- mgcv::gam(Comp1 ~ s(longitude, latitude), data = sample.pca.scores) anova(GAM) viz.GAM <- getViz(GAM) plot(viz.GAM, 1) + l_fitRaster() + l_fitContour(color ="grey30") + l_points() + labs(title = NULL) + scale_fill_continuous(type = "viridis", name = "PC1", na.value = "transparent") + scale_x_continuous(limits = c(-11, 32), expand = c(0, 0)) + scale_y_continuous(limits = c(34, 57), expand = c(0, 0)) + geom_path(data = map_data("world"), aes(x = long, y = lat, group = group), color ="black", inherit.aes = FALSE) # PC2 ggplot(data = world) + geom_sf() + geom_point(data = sample.pca.scores, aes(x = jitter.x, y = jitter.y, colour = Comp2), size = 1) + coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) + scale_color_viridis(name = "PC2") + xlab("Longitude") + ylab("Latitude") GAM <- mgcv::gam(Comp2 ~ s(longitude, latitude), data = sample.pca.scores) anova(GAM) viz.GAM <- getViz(GAM) plot(viz.GAM, 1) + l_fitRaster() + scale_fill_continuous(na.value = "transparent") + l_fitContour(color ="grey30") + l_points() + labs(title = NULL) + scale_fill_continuous(type = "viridis", name = "PC2", na.value = "transparent") + scale_x_continuous(limits = c(-11, 32), expand = c(0, 0)) + scale_y_continuous(limits = c(34, 57), expand = c(0, 0)) + geom_path(data = map_data("world"), aes(x = long, y = lat, group = group), color ="black", inherit.aes = FALSE) ``` ## Canonical variate analysis based on countries ```{r CVA-countries} # Convert 2D array into a 3D array sample.3D <- arrayspecs(sample.aligned[xy.Names], p, k) dimnames(sample.3D)[[3]] <- sample.aligned$sample # use equal prior probability for all groups n.country <- length(unique(sample.aligned$country)) # number of groups sample.cva <- CVA(sample.3D, sample.aligned$country, rounds = 10000, cv = TRUE, prior = rep(1/n.country, n.country)) sample.cva.scores <- as.data.frame(sample.cva$CVscores) # remove unwanted spaces in variable names otherwise use `CV 1` colnames(sample.cva.scores) <- gsub(" ", "", colnames(sample.cva.scores)) sample.cva.scores <- cbind(sample.geo.data, sample.cva.scores) ggplot(sample.cva.scores, aes(x = CV1, y = CV2, shape = country, color = country)) + geom_point() + scale_shape_manual(name ="Country", values = c(0:2,15:25)) + scale_color_manual(name ="Country", values = rainbow(13)) + stat_ellipse() ``` ## Confusion matrix Classification of the samples to countries. ```{r class-matrix} CVA.class <- typprobClass(sample.cva$CVscores, groups = as.factor(sample.pca.scores$country), outlier = 0) print(CVA.class) ``` ## Classification of the samples to countries The map shows only incorrectly classifies samples. Color and shape of the points indicate country to which a sample was classified. The classification with cross-validation ```{r class-map} sample.class <- data.frame(sample = sample.geo.data$sample, country = sample.geo.data$country, latitude = sample.geo.data$latitude, longitude = sample.geo.data$longitude, classified.as = CVA.class$groupaffinCV) # add column indicating incorrect classification sample.class$error <- ifelse(sample.class$country == sample.class$classified.as, "OK", "error") # only the incorrectly classified samples sample.error <- sample.class[sample.class$error == "error", c("sample", "country", "latitude", "longitude","classified.as")] # incorrect countries classification map ggplot(data = world) + geom_sf() + geom_point(data = sample.error, aes(x = longitude, y = latitude, colour = classified.as, shape = classified.as), size = 2) + scale_color_manual(name ="Classified as", values = rainbow(13)) + scale_shape_manual(name ="Classified as", values = c(0:2, 15:25)) + coord_sf(xlim = c(-11, 33), ylim = c(34, 56), expand = FALSE) + labs(x = "Longitude", y = "Latitude") ``` ## Differences between countries in wing shape ```{r class-countries} # Mahalanobis Distnces between countries dist <- sample.cva$Dist MD.dist <- as.matrix(dist$GroupdistMaha) MD.dist # Significance of differences betwen countries MD.prob <- as.matrix(dist$probsMaha) MD.prob ``` ## Similarity trees ```{r tree} # UPGMA tree tree.upgma <- upgma(MD.dist) plot(tree.upgma, label.offset = 0.1, cex = 0.5) ``` ## Isolation by distance There is significant correlation between geographical distances and Mahalanobis distances between countries ```{r isolation} # calculate geographical distance between countries country.geo.data <- aggregate(cbind(sample.geo.data$latitude, sample.geo.data$longitude), by = list(sample.geo.data$country), FUN = mean) country.geo.data <- reshape::rename(country.geo.data, c(Group.1 = "country", V1 = "latitude", V2 = "longitude")) geo.dist <- distm(country.geo.data[c("longitude","latitude")], fun = distGeo) row.names(geo.dist) <- country.geo.data$country colnames(geo.dist) <- country.geo.data$country geo.dist <- geo.dist / 1000 # convert meters to kilometers geo.dist vegan::mantel(geo.dist, MD.dist, permutations = 9999, method = "spearman") # convert distance table to get distance in one column MD.distance <- as.dist(MD.dist) MD.distance <- as.numeric(MD.distance) MD.distance <- data.frame(t(combn(rownames(MD.dist),2)), MD.distance) geo.distance <- as.dist(geo.dist) geo.distance <- as.numeric(geo.distance) geo.distance <- data.frame(t(combn(rownames(geo.dist),2)), geo.distance) geo.distance$MD.distance <- MD.distance$MD.distance geo.distance$dist.name <- paste0(geo.distance$X1, "-", geo.distance$X2) outliers <- subset(geo.distance, dist.name %in% c("AT-HU","AT-HR", "AT-SI")) ggplot(geo.distance, aes(x = geo.distance, y = MD.distance)) + geom_point() + coord_trans(x = "log10") + geom_text( data = outliers, aes(x = geo.distance , y = MD.distance, label = dist.name), hjust =-0.2) + xlab("Geographical distance (km)") + ylab("Mahalanobis distance between wing shape") ``` ## Canonical variate analysis based on regions ```{r CVA-regions} # add column with regions consisting of well represented countries or pairs of neighboring countries sample.aligned$region <- ifelse(sample.aligned$country == "ES" | sample.class$country == "PT" , "ES-PT", sample.aligned$country) sample.aligned$region <- ifelse(sample.aligned$country == "RO" | sample.class$country == "MD" , "RO-MD", sample.aligned$region) sample.aligned$region <- ifelse(sample.aligned$country == "HR" | sample.class$country == "SI" , "HR-SI", sample.aligned$region) # exclude countries with sample size smaller than 25 sample.region <- sample.aligned[sample.aligned$country != "AT" & sample.aligned$country != "HU" & sample.aligned$country != "ME" & sample.aligned$country != "RS", ] sample.3D <- arrayspecs(sample.region[xy.Names], p, k) dimnames(sample.3D)[[3]] <- sample.region$sample # use equal prior probability for all groups n.region <- length(unique(sample.region$region)) # number of groups region.cva <- CVA(sample.3D, sample.region$region, rounds = 10000, cv = TRUE, prior = rep(1/n.region, n.region)) region.cva.scores <- as.data.frame(region.cva$CVscores) # remove unwanted spaces in variable names otherwise use `CV 1` colnames(region.cva.scores) <- gsub(" ", "", colnames(region.cva.scores)) ggplot(region.cva.scores, aes(x = CV1, y = CV2, shape = sample.region$region, color = sample.region$region)) + geom_point() + scale_shape_manual(name ="Region", values = c(0:2,19,5:6)) + scale_color_manual(name ="Region", values = rainbow(6)) + stat_ellipse() CVA.class.region <- typprobClass(region.cva$CVscores, groups = as.factor(sample.region$region), outlier = 0) print(CVA.class.region) ``` # Comparison with lineages dataset Nawrocka et al., (2018a) and Nawrocka et al., (2018b) ## Read landmark coordinates from lineages dataset ```{r read-coordinates-lineage} wings.lin <- read.csv("https://zenodo.org/record/7567336/files/Nawrocka_et_al2018.csv", header = TRUE) # extract sample classifier wings.lin <- tidyr::separate( data = wings.lin, col = file, sep = c(10), # sample name is in the first 10 characters of file name into = c("sample", NA), remove = FALSE ) # extract lineage classifier wings.lin <- tidyr::separate( data = wings.lin, col = file, sep = c(1), # lineage code is in the first character of file name into = c("lineage", NA), remove = FALSE ) head(wings.lin, 2) ``` ## GPA-alignment of lineages dataset Before analysis all landmark configurations have to be superimposed. ```{r GPA-lineage} # Convert 2D array into a 3D array wings.lin.3D <- arrayspecs(wings.lin[xyNames], p, k) dimnames(wings.lin.3D)[[3]] <- wings.lin$file # Align the coordinates using Generalized Procrustes Analysis GPA.lin <- gpagen(wings.lin.3D, print.progress = FALSE) # Convert 3D array into a 2D array - opposite to arrayspecs wings.lin.aligned <- two.d.array(GPA.lin$coords) head(wings.lin.aligned, 2) ``` ## Average aligned coordinates within samples for lineages dataset ```{r sample-average-lineage} sample.lin.aligned <- aggregate(wings.lin.aligned, by = list(wings.lin$sample), FUN = mean) names(sample.lin.aligned)[names(sample.lin.aligned) == "Group.1"] <- "sample" # rename column # extract lineage code as classifier sample.lin.aligned <- tidyr::separate( data = sample.lin.aligned, col = sample, sep = c(1), into = c("lineage", NA), remove = FALSE ) geo.data.lin <- read.csv("https://zenodo.org/record/7567336/files/Nawrocka_et_al2018-geo-data.csv", header = TRUE) sample.lin.aligned$subspecies <- geo.data.lin$subspecies sample.lin.aligned$country <- geo.data.lin$country head(sample.lin.aligned, 2) ``` ## Canonical variate analysis based on lineages ```{r CVA-lineages} # Convert 2D array into a 3D array for easier analysis of unknown samples sample.lin.3D <- arrayspecs(sample.lin.aligned[xy.Names], p, k) dimnames(sample.lin.3D)[[3]] <- sample.lin.aligned$sample # use equal prior probability for all groups n.lin <- length(unique(sample.lin.aligned$lineage)) # number of groups sample.lin.cva <- CVA(sample.lin.3D, sample.lin.aligned$lineage, rounds = 10000, cv = TRUE, prior = rep(1/n.lin, n.lin)) sample.lin.cva.scores <- as.data.frame(sample.lin.cva$CVscores) # remove unwanted spaces in variable names otherwise use `CV 1` colnames(sample.lin.cva.scores) <- gsub(" ", "", colnames(sample.lin.cva.scores)) sample.lin.cva.scores <- cbind(sample.lin.aligned$lineage, sample.lin.cva.scores) names(sample.lin.cva.scores)[1] <- "lineage" # rename column rownames(sample.lin.cva.scores) <- sample.lin.aligned$sample ggplot(sample.lin.cva.scores, aes(x = CV1, y = CV2, color = lineage)) + geom_point() + coord_fixed() + stat_ellipse() ``` ## Classification of the samples to lineages ```{r class-lineages} # Convert 2D array into a 3D array sample.only <- sample.aligned[xy.Names] unknown.wings <- arrayspecs(sample.only, p, k) dimnames(unknown.wings)[[3]] <- sample.only$sample # calculate covarience for each lineage covariances <- lapply(unique(sample.lin.cva.scores$lineage), function(x) cov(sample.lin.cva.scores[sample.lin.cva.scores$lineage==x,-1])) means <- aggregate(sample.lin.cva.scores[,names(sample.lin.cva.scores) != "lineage"], list(sample.lin.cva.scores$lineage), FUN=mean) rownames(means) <- means$Group.1 means <- means[,names(means) != "Group.1"] # Projection of the samples to canonical variate space from Nawrocka et al. 2018 groups <- rownames(means) result.list <- vector(mode = "list", length = nrow(sample.aligned)) CV.list <- vector(mode = "list", length = nrow(sample.aligned)) for (r in 1:nrow(sample.aligned)) { # Align unknown consensus with consensus from reference samples unknown.OPA <- procOPA(GPA.lin$consensus, unknown.wings[,,r]) unknown.aligned <- unknown.OPA$Bhat CV.row <- predict(sample.lin.cva, unknown.aligned) # create empty list for results result <- numeric(length(groups)) for (i in 1:length(groups)) { MD <- mahalanobis(CV.row, unlist(means[i, ]), as.matrix(covariances[[i]])) result[i] <- MD } result.list[[r]] <- result CV.list[[r]] <- CV.row } CV.tab <- do.call(rbind, CV.list) # remove unwanted spaces in variable names otherwise use `CV 1` colnames(CV.tab) <- gsub(" ", "", colnames(CV.tab)) CV.tab <- as.data.frame(CV.tab) CV.tab <- cbind(CV.tab, sample.aligned$country) colnames(CV.tab) <- gsub("sample.aligned\\$", "", colnames(CV.tab)) # Black ellipses A, C, M and O indicate 95% confidence regions of reference samples from Nawrocka et al. 2018 ggplot(CV.tab, aes(x = CV1, y = CV2, shape = country, color = country))+ geom_point() + scale_shape_manual(name ="Country", values = c(0:2,15:25)) + scale_color_manual(name ="Country", values = rainbow(13)) + stat_ellipse() + stat_ellipse(data = subset(sample.lin.cva.scores, lineage == "A"), mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) + stat_ellipse(data = subset(sample.lin.cva.scores, lineage == "C"), mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) + stat_ellipse(data = subset(sample.lin.cva.scores, lineage == "M"), mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) + stat_ellipse(data = subset(sample.lin.cva.scores, lineage == "O"), mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) + geom_label(means, mapping = aes(x = CV1, y = CV2, label = rownames(means)), inherit.aes = FALSE) MD.tab <- do.call(rbind, result.list) MD.tab <- sqrt(MD.tab) # column names for lineages colnames(MD.tab) <- groups MD.tab <- as.data.frame(MD.tab) lineage <- colnames(MD.tab)[apply(MD.tab, 1, which.min)] # final column names colnames(MD.tab) <- paste0("MD.",groups) MD.tab <- cbind(MD.tab, lineage) rownames(MD.tab) <- sample.aligned$sample # frequencies of the lineages table(MD.tab$lineage) # frequencies of the lineage in countries table(sample.aligned$country, MD.tab$lineage) sample.lineages <- MD.tab sample.lineages <- cbind(sample.lineages, sample.geo.data$longitude, sample.geo.data$latitude) colnames(sample.lineages) <- gsub("sample.geo.data\\$", "", colnames(sample.lineages)) # Classification of the samples to lineages according to Nawrocka et al. 2018 ggplot(data = world) + geom_sf() + geom_point(data = sample.lineages, aes(x = jitter.x, y = jitter.y, colour = lineage), size = 1) + scale_color_manual(name ="Lineage", values = rainbow(4)) + coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) + xlab("Longitude") + ylab("Latitude") # Lineage A # Mahalanobis Distance to lineage A (with jitter to show multiple samples from one location) ggplot(data = world) + geom_sf() + geom_point(data = sample.lineages, aes(x = jitter.x, y = jitter.y, colour = MD.A), size = 1) + coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) + scale_color_viridis(name = "Distance to\nlineage A", direction = -1) + xlab("Longitude") + ylab("Latitude") # spatial interpolation of the above data using GAM GAM <- mgcv::gam(MD.A ~ s(longitude, latitude), data = sample.lineages) viz.GAM <- getViz(GAM) GAM.A <- plot(viz.GAM, 1) + l_fitRaster() + l_fitContour(color ="grey30") + l_points() + labs(title = NULL) + scale_fill_continuous(type = "viridis", direction = -1, name = "Distance to\nlineage A", na.value = "transparent") + scale_x_continuous(limits = c(-11, 32), expand = c(0, 0)) + scale_y_continuous(limits = c(34, 57), expand = c(0, 0)) + geom_path(data = map_data("world"), aes(x = long, y = lat, group = group), color ="black", inherit.aes = FALSE) GAM.A # Lineage C # Mahalanobis Distance to lineage C (with jitter to show multiple samples from one location) ggplot(data = world) + geom_sf() + geom_point(data = sample.lineages, aes(x = jitter.x, y = jitter.y, colour = MD.C), size = 1) + coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) + scale_color_viridis(name = "Distance to\nlineage C", direction = -1) + xlab("Longitude") + ylab("Latitude") # spatial interpolation of the above data using GAM GAM <- mgcv::gam(MD.C ~ s(longitude, latitude), data = sample.lineages) viz.GAM <- getViz(GAM) GAM.C <- plot(viz.GAM, 1) + l_fitRaster() + l_fitContour(color ="grey30") + l_points() + labs(title = NULL) + scale_fill_continuous(type = "viridis", direction = -1, name = "Distance to\nlineage C", na.value = "transparent") + scale_x_continuous(limits = c(-11, 32), expand = c(0, 0)) + scale_y_continuous(limits = c(34, 57), expand = c(0, 0)) + geom_path(data = map_data("world"), aes(x = long, y = lat, group = group), color ="black", inherit.aes = FALSE) GAM.C # Lineage M # Mahalanobis Distance to lineage M (with jitter to show multiple samples from one location) ggplot(data = world) + geom_sf() + geom_point(data = sample.lineages, aes(x = jitter.x, y = jitter.y, colour = MD.M), size = 1) + coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) + scale_color_viridis(name = "Distance to\nlineage M", direction = -1) + xlab("Longitude") + ylab("Latitude") # spatial interpolation of the above data using GAM GAM <- mgcv::gam(MD.M ~ s(longitude, latitude), data = sample.lineages) viz.GAM <- getViz(GAM) GAM.M <- plot(viz.GAM, 1) + l_fitRaster() + l_fitContour(color ="grey30") + l_points() + labs(title = NULL) + scale_fill_continuous(type = "viridis", direction = -1, name = "Distance to\nlineage M", na.value = "transparent") + scale_x_continuous(limits = c(-11, 32), expand = c(0, 0)) + scale_y_continuous(limits = c(34, 57), expand = c(0, 0)) + geom_path(data = map_data("world"), aes(x = long, y = lat, group = group), color ="black", inherit.aes = FALSE) GAM.M # Lineage O # Mahalanobis Distance to lineage O (with jitter to show multiple samples from one location) ggplot(data = world) + geom_sf() + geom_point(data = sample.lineages, aes(x = jitter.x, y = jitter.y, colour = MD.O), size = 1) + coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) + scale_color_viridis(name = "Distance to\nlineage O", direction = -1) + xlab("Longitude") + ylab("Latitude") # spatial interpolation of the above data using GAM GAM <- mgcv::gam(MD.O ~ s(longitude, latitude), data = sample.lineages) viz.GAM <- getViz(GAM) GAM.O <- plot(viz.GAM, 1) + l_fitRaster() + l_fitContour(color ="grey30") + l_points() + labs(title = NULL) + scale_fill_continuous(type = "viridis", direction = -1, name = "Distance to\nlineage O", na.value = "transparent") + scale_x_continuous(limits = c(-11, 32), expand = c(0, 0)) + scale_y_continuous(limits = c(34, 57), expand = c(0, 0)) + geom_path(data = map_data("world"), aes(x = long, y = lat, group = group), color ="black", inherit.aes = FALSE) GAM.O ``` ## Identification of unknown samples As unknown there were used European samples from lineages dataset Nawrocka et al., (2018a). ```{r class-unknown} # use samples from lineages M and C as unknown sample.unknown <- subset(sample.lin.aligned, lineage == "M" | lineage == "C") # convert 2D array into a 3D array unknown.lin <- arrayspecs(sample.unknown[xy.Names], p, k) dimnames(unknown.lin)[[3]] <- sample.unknown$sample # calculate CVA scores for unknown samples CV.list <- vector(mode = "list", length = nrow(sample.unknown)) for (r in 1:nrow(sample.unknown)) { # Align unknown consensus with consensus from reference samples unknown.OPA <- procOPA(GPA$consensus, unknown.lin[,,r]) unknown.aligned <- unknown.OPA$Bhat CV.row <- predict(region.cva, unknown.aligned) CV.list[[r]] <- CV.row } CV.tab <- do.call(rbind, CV.list) # remove unwanted spaces in variable names otherwise use `CV 1` colnames(CV.tab) <- gsub(" ", "", colnames(CV.tab)) CV.tab <- as.data.frame(CV.tab) outlier.threshold <- 0.001 typprobs <- typprobClass(CV.tab, region.cva$CVscores, groups = as.factor(sample.region$region), outlier = outlier.threshold, sep = FALSE) print(typprobs) # probability of classification for all groups P.tab <- typprobs$probs rownames(P.tab) <- sample.unknown$sample # for each sample find maximum probability P.max <- apply(P.tab, 1, FUN = max) # for each sample find region with larges probability region.max <- colnames(P.tab)[apply(P.tab, 1, which.max)] # samples with probability below 0.001 are considered as outlines P.tab.max <- data.frame(country = sample.unknown$country, P.max, region.max) head(P.tab.max, 2) # frequencies of the countries in regions without detection of outliers table(P.tab.max$country, P.tab.max$region.max) P.tab.max$region.none <- ifelse(P.max < outlier.threshold, "none", region.max) # frequencies of the countries in regions with detection of outliers table(P.tab.max$country, P.tab.max$region.none) # regions labels region.cva.scores$region <- sample.region$region region.cva.means <- aggregate(region.cva.scores[,names(region.cva.scores) != "region"], list(region.cva.scores$region), FUN = mean) rownames(region.cva.means) <- region.cva.means$Group.1 region.cva.means <- region.cva.means[,names(region.cva.means) != "Group.1"] ggplot(CV.tab, aes(x = CV1, y = CV2, shape = sample.unknown$subspecies, color = sample.unknown$subspecies)) + geom_point() + scale_shape_manual(name ="subspecies", values = c(0:5)) + scale_color_manual(name ="subspecies", values = rainbow(6)) + stat_ellipse() + stat_ellipse(data = subset(region.cva.scores, region == "ES-PT"), mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) + stat_ellipse(data = subset(region.cva.scores, region == "GR"), mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) + stat_ellipse(data = subset(region.cva.scores, region == "HR-SI"), mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) + stat_ellipse(data = subset(region.cva.scores, region == "PL"), mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) + stat_ellipse(data = subset(region.cva.scores, region == "RO-MD"), mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) + stat_ellipse(data = subset(region.cva.scores, region == "TR"), mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) + geom_label(region.cva.means, mapping = aes(x = CV1, y = CV2, label = rownames(region.cva.means)), inherit.aes = FALSE) ``` ## References

Nawrocka, A., Kandemir, İ., Fuchs, S., & Tofilski, A. (2018). Computer software for identification of honey bee subspecies and evolutionary lineages. Apidologie, 49(2), 172-184. https://doi.org/10.1007/s13592-017-0538-y

Nawrocka, A., Kandemir, İ., Fuchs, S., & Tofiilski, A. (2018). Dataset: Computer software for identification of honey bee subspecies and evolutionary lineages. Apidologie, 49, 172–184. https://doi.org/10.5281/zenodo.7567336

Oleksa, A., Căuia, E., Siceanu, A., Puškadija, Z., Kovačić, M., Pinto, M.A., Rodrigues, P.J., Hatjina, F., Charistos, L., Bouga, M., Prešern, J., Kandemir, I., Rašić, S., Kusza, S., & Tofilski, A. (2022). Collection of wing images for conservation of honey bees (*Apis mellifera*) biodiversity in Europe [Data set]. Zenodo. https://doi.org/10.5281/zenodo.7244070

## Information about session ```{r info} sessionInfo() ```