% Conference paper (inproceedings) by Zhao and Alachiotis, BIBM 2023, pp. 793--800.
% Acronyms are brace-protected so sentence-casing styles keep their capitals,
% and the percent sign in the abstract is escaped so it cannot act as a TeX
% comment character if a style typesets the abstract.
% NOTE(review): "address" holds a country rather than the publisher's city -- confirm
% the intended value before relying on it in a style that prints the address.
@inproceedings{3cf70a1ba39445699fd54e1aacd10490,
  author    = {Zhao, Hanqing and Alachiotis, Nikolaos},
  title     = {Effective Data Preprocessing Techniques for {CNN}-based Selective Sweep Detection},
  booktitle = {2023 {IEEE} International Conference on Bioinformatics and Biomedicine ({BIBM})},
  series    = {{IEEE} International Conference on Bioinformatics and Biomedicine ({BIBM})},
  publisher = {IEEE},
  address   = {United States},
  pages     = {793--800},
  year      = {2023},
  month     = dec,
  day       = {8},
  doi       = {10.1109/BIBM58861.2023.10385303},
  isbn      = {979-8-3503-3749-5},
  language  = {English},
  keywords  = {2023 OA procedure, Data preprocessing, Sociology, Genomics, Stability analysis, Classification algorithms, Convolutional neural networks, Correlation},
  abstract  = {Identifying positive selection has been cast as a classification task, with Convolutional Neural Networks (CNNs) already delivering higher accuracy than summary statistics and likelihood-based approaches. While several CNN-based methods rearrange the pixels of images representing raw genomic data as a preprocessing technique to enhance classification accuracy, the effectiveness of such pixel-rearrangement methods has not been thoroughly studied in the presence of confounding factors such as population bottlenecks and recombination hotspots. Here, we present a series of pixel-rearrangement algorithms to increase CNN classification accuracy for selective sweep detection, and evaluate the performance of four CNN models that are specifically designed for detecting selective sweeps. We find that data preprocessing based on pixel-rearrangement algorithms significantly improves the overall classification accuracy of a given CNN for diverse datasets simulating confounding factors. We observe up to 24.55\% higher top-1 accuracy than using the preprocessing algorithms proposed by the authors of each CNN architecture. Furthermore, our results suggest a correlation between the stability of the rearrangement algorithms (over the different CNN architectures and confounding factors) and their performance. Based on these findings, we make suggestions for the most suitable preprocessing technique per CNN architecture used in this study. We provide the data rearrangement algorithms as a distinct module available for download at: https://github.com/Zhaohq96/Genetic-data-rearrangement.},
  note      = {2023 IEEE International Conference on Bioinformatics and Biomedicine, BIBM 2023 ; Conference date: 05-12-2023 Through 08-12-2023},
}