% Eyben et al., ICASSP 2011: conference paper on audiovisual classification of
% vocal outbursts with LSTM networks. The citation key is an opaque hash-like
% identifier; it is kept unchanged so existing citations still resolve.
% Authors are stored in "Last, First" form and acronyms in booktitle are
% brace-protected so that any bibliography style renders them correctly.
@inproceedings{56938ff6476f43acabc392634d39b1b7,
  author    = {Eyben, Florian and Petridis, Stavros and Schuller, Bj{\"o}rn and Tzimiropoulos, Georgios and Zafeiriou, Stefanos and Pantic, Maja},
  title     = {Audiovisual classification of vocal outbursts in human conversation using long-short-term memory networks},
  booktitle = {{IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP} 2011)},
  publisher = {IEEE},
  address   = {United States},
  year      = {2011},
  month     = may,
  pages     = {5844--5847},
  doi       = {10.1109/ICASSP.2011.5947690},
  isbn      = {978-1-4577-0538-0},
  language  = {english},
  note      = {eemcs-eprint-21353 ; IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP 2011 ; Conference date: 22-05-2011 Through 27-05-2011},
  keywords  = {METIS-285044, IR-79507, Audio signal processing, Support Vector Machines, HMI-MI: MULTIMODAL INTERACTIONS, EC Grant Agreement nr.: FP7/211486, video signal processing, recurrent neural nets, audio-visual systems, EWI-21353},
  abstract  = {We investigate classification of non-linguistic vocalisations with a novel audiovisual approach and Long Short-Term Memory (LSTM) Recurrent Neural Networks as highly successful dynamic sequence classifiers. As database of evaluation serves this year's Paralinguistic Challenge's Audiovisual Interest Corpus of human-to-human natural conversation. For video-based analysis we compare shape and appearance based features. These are fused in an early manner with typical audio descriptors. The results show significant improvements of LSTM networks over a static approach based on Support Vector Machines. More important, we can show a significant gain in performance when fusing audio and visual shape features.},
}