% Conference paper (SANER 2024): Python version identification with deep learning.
% - Percent signs in the abstract are escaped (\%) so the text can be typeset safely.
% - Only words that must keep their capitalisation are brace-protected in the title.
% - `day` and `language` are kept from the export; styles that do not know them ignore them.
% NOTE(review): `address` holds a country, not the publisher's city -- confirm and replace.
% NOTE(review): month/day (16 Jul) differ from the conference dates in `note` (12-15 Mar);
%               presumably the online publication date -- confirm.
@inproceedings{3b2c8f9fd61749cc8f502262010f1ef3,
  author    = {Gerhold, Marcus and Solovyeva, Lola and Zaytsev, Vadim},
  title     = {The Limits of the Identifiable: Challenges in {Python} Version Identification with Deep Learning},
  booktitle = {Proceedings - 2024 {IEEE} International Conference on Software Analysis, Evolution and Reengineering, {SANER} 2024},
  series    = {Proceedings {IEEE} International Conference on Software Analysis, Evolution and Reengineering ({SANER})},
  pages     = {137--146},
  publisher = {IEEE},
  address   = {United States},
  year      = {2024},
  month     = jul,
  day       = {16},
  doi       = {10.1109/SANER60148.2024.00022},
  isbn      = {979-8-3503-3067-0},
  language  = {English},
  keywords  = {CodeBERT, Deep Learning (DL), Python, Software language identification},
  abstract  = {The evolution of Python requires accurate version identification to facilitate compatibility and ongoing support. We extend previous work on deep learning models for Python version identification, where LSTM and CodeBERT achieved a 92\% accuracy on short code snippets. We further expand these results to larger realistic files, utilising code segmentation techniques for varying input granularities. These techniques ranged from per-line analysis to larger code segments. Our findings show that while LSTM with CodeBERT embeddings maintained high accuracy on short snippets, performance significantly drops on longer segments, particularly in balancing information retention and misclassification risks. Notably, import-statement analysis, despite being the most intuitive indicator of version requirements, reached only a 30\% accuracy. This exposes the limitations of our approach when encountering rare or user-defined modules. The findings expose the limitations of deep learning for language version identification, and suggest that alternative approaches may be necessary for high accuracy on larger datasets.},
  note      = {Publisher Copyright: {\textcopyright} 2024 IEEE.; 31st IEEE International Conference on Software Analysis, Evolution and Reengineering, SANER 2024 ; Conference date: 12-03-2024 Through 15-03-2024},
}