<?xml version="1.0" encoding="ISO-8859-1"?>
<?xml-stylesheet type="text/xsl" href="studyComparison.xslt"?>
<studies>
<study name="Study Name/Author of study">
<organization content="Organization"/>
<reference content="Reference paper"/>
<remark content="Remark"/>
<year content="Year"/>
<speaker-feature content="Classified features: E.g. age, gender, dialect, emotion"/>
<results content="Result: Error values (usually 1-accuracy)"/>
<features content="Features: Which features were computed from the speech signal"/>
<classifier content="Statistical classifier: Which classification algorithm was used"/>
<test-sample content="Test sample:  Kind and length of test material"/>
<languages content="Languages: For which languages was the study done"/>
<resources content="Resources: Anything on footprint or run time"/>
<pre-gender-separation content="Pre-gender-separation: Is a gender-detection used initial to classification?"/>
<data-description content="Data: Which database was used for training and test"/>
<telephone-data content="Telephone data: Training/test database consists of data spoken over telephone lines?"/>
</study>
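<!--
  A minimal sketch (assuming Python and that this document is saved locally as
  studyComparison.xml; both are assumptions, not part of the dataset) of how the
  per-study records defined by the legend entry above can be loaded and tabulated
  using only the standard library:

    import xml.etree.ElementTree as ET

    def field(study, tag):
        # Return the 'content' attribute of a child element, or '' if the child is absent.
        el = study.find(tag)
        return el.get("content", "") if el is not None else ""

    tree = ET.parse("studyComparison.xml")           # assumed local file name
    for study in tree.getroot().findall("study"):    # one element per study, incl. the legend entry
        print(study.get("name"),
              field(study, "speaker-feature"),
              field(study, "classifier"),
              field(study, "results"),
              sep=" | ")
-->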
<study name="Schmitt">
<organization content="Univ. Ulm/SpeechCycle Inc."/>
<reference content="O. Herm, A.Schmitt, J. Liscombe. When Calls Go Wrong:
How to Detect Problematic Calls
Based on Log-Files and Emotions? In Proc. ICSLP 2008, Brisbane"/>
<remark content=""/>
<year content="2008"/>
<speaker-feature content="Prediction of problematic phone calls"/>
<results content="about 79% accuracy after 4 dialog turns (other corpus 90%
after 5 turns, see referenced study). Emotional, i.e. acoustic, features did
not yield improvements"/>
<features content="ASR (transcription, confidence,triggeredgrammarname,
inputmode, matchnomatch, bargedin), NLU (interpretation, loopname), DM
(roleindex, rolename, activitytype, turn duration), acoustic-features (F0,
energy, MFCCs)"/>
<classifier content="SLIPPER"/>
<test-sample content="voice portal data."/>
<languages content="English"/>
<resources content="NA"/>
<pre-gender-separation content="no"/>
<data-description content="subset from 100,00 dialogs: 1911 dialogs in
total. 759 'bad' calls, 1152 'good' calls. Training: 1261 randomly chosen,
Testing: 649 disjunct"/>
<telephone-data content="yes"/>
</study>
<study name="Ang">
<organization content="Berkley/SRI"/>
<reference content="J. Ang et al. Prosody-based automatic detection of annoyance and frustration in human-computer dialog. In J. H. L. Hansen and B. Pellom, editors, Proc. ICSLP 2002"/>
<remark content=""/>
<year content="2002"/>
<speaker-feature content="Emotion"/>
<results content="about 15% for binary decision (negative/else), about 30% for ternary (annoyed, frustrated, else) based solely on acoustic features."/>
<features content="acoustic-features (duration based on phonemes, spectral tilt, F0, energy), words based on ASR (ER ~25%), manually labeled: speaking-style [hyper articulating | pausing | raised voice], repeated requests, data quality problems."/>
<classifier content="CART"/>
<test-sample content="voice portal-utterance."/>
<languages content="English"/>
<resources content="NA"/>
<pre-gender-separation content="no"/>
<data-description content="DARPA data: 830 dialogs, greater than 20000 turns, faked air-travel arrangement system. 75% for training, 25% for test"/>
<telephone-data content="yes"/>
</study>
<study name="Arslan">
<organization content="Duke Univ."/>
<reference content="Arslan and Hansen: Language Accent Classification in American English"/>
<remark content="compared with human performance and humans performed worse than automatic classification."/>
<year content="1996"/>
<speaker-feature content="Accent"/>
<results content="about 30% for three accents (American Turkish Chinese)"/>
<features content="MFCCs, first order differences and energy."/>
<classifier content="HMM"/>
<test-sample content="20 words and 4 sentences"/>
<languages content="English"/>
<resources content="NA"/>
<pre-gender-separation content=""/>
<data-description content="44 male speakers of accents neutral, Turkish, German, Chinese"/>
<telephone-data content="possibly"/>
</study>
<study name="Burkhardt">
<organization content="DTAG"/>
<reference content="F. Burkhardt, M. van Ballegooy, R. Englert, R. Huber: An Emotion-Aware Voice Portal, ESSP 2005"/>
<remark content=""/>
<year content="2005"/>
<speaker-feature content="Emotion"/>
<results content="about 25% for binary classifier"/>
<features content="39 features: pitch, energy duration: min, max mean, first derivative, jitter, ratio of voiced vs. non-voiced parts. "/>
<classifier content="GMMs"/>
<test-sample content="Short Words (voice portal response)"/>
<languages content="German"/>
<resources content="less than 30 * real time"/>
<pre-gender-separation content="no"/>
<data-description content="acted data from students and colleagues."/>
<telephone-data content="yes"/>
</study>
<study name="Chen">
<organization content="Microsoft"/>
<reference content="Chen, Huang, Chang and Wang: Automatic Accent Identification using Gaussian Mixture Models. ASRU 2001"/>
<remark content="Idea to detect accent before speech recognition in order to adapt speech models"/>
<year content="2001"/>
<speaker-feature content="Accent: Gender"/>
<results content="gender: 3% with one utterance, drops to 1% given 50 utterances. Accent 11% for females and 15% for male speakers of 4 accents and 4 utterances."/>
<features content="39 order MFCC: 12 cepstral coefficients, energy, first and second order differentiated."/>
<classifier content="GMMs, error rate drops by 5% if number of components is enlarged from 8 to 64."/>
<test-sample content="3-4 sec. Utterances"/>
<languages content="Mandarin"/>
<resources content="NA"/>
<pre-gender-separation content="yes"/>
<data-description content="multi Accent Mandarin Corpus collected by Microsoft. Contains 300 speakers of 4 mandarin accents, 50 utterances each."/>
<telephone-data content="no"/>
</study>
<study name="Devillers">
<organization content="LIMSI"/>
<reference content="Devillers, L., Vasilescu, I.  , Lori Lamel, L. : Annotation and Detection of Emotion in a Task-oriented Human-Human Dialog Corpus , ISLE Workshop on dialog tagging, Edinburgh, Dec 2002"/>
<remark content="high recognition rate can be explained by the fact that labeling as well as classification was based solely on spoken words."/>
<year content="2002"/>
<speaker-feature content="Emotion"/>
<results content="32% for four emotions + neutral"/>
<features content="words"/>
<classifier content="unigram topic tracker (originally developed to see weather a document concerns a specified topic)"/>
<test-sample content="call-center utterance"/>
<languages content="French"/>
<resources content="NA"/>
<pre-gender-separation content="no"/>
<data-description content="5000 turns from customer-agent 100 dialogs from stock-exchange service application. Manually transcribed."/>
<telephone-data content="yes"/>
</study>
<study name="Grimm">
<organization content="Univ. Karlsruhe"/>
<reference content=" Michael Grimm, Kristian Kroschel, Rule-based Emotion Classification Using Acoustic Features. Proceedings   3. Int. Conf. on Telemedicine and Multimedia Communication, Kajetany/Poland, 2005"/>
<remark content="Recognition of emotion dimensions"/>
<year content="2005"/>
<speaker-feature content="Emotion"/>
<results content=""/>
<features content="pitch, energy, MFCCs"/>
<classifier content="Fuzzy classification"/>
<test-sample content="TV-talk recordings"/>
<languages content="German"/>
<resources content="NA"/>
<pre-gender-separation content="no"/>
<data-description content="TV-talk recordings"/>
<telephone-data content="no"/>
</study>
<study name="Kumpf">
<organization content="Univ. Sidney"/>
<reference content="K. Kumpf and R.W. King: Automatic Accent Classification of Foreign Accented Australian English Speech"/>
<remark content=""/>
<year content="2006"/>
<speaker-feature content="Accent"/>
<results content="about 25% for three accents (neutral, Arabic, Vietnamese)"/>
<features content="MFCC modeling phonemes"/>
<classifier content="HMM with Bi-gram Language Model"/>
<testsample content="3-5 sec length utterances"/>
<languages content="English"/>
<resources content="NA"/>
<pre-gender-separation content="no"/>
<data-description content="ANDOSL database: 100 utterances of 20 speakers of three accents: Australian, Arabic, Vietnamese"/>
<telephone-data content="no"/>
</study>
<study name="Lee">
<organization content="Berkely/ICSI"/>
<reference content="C. M. Lee and S. S. Narayanan, Toward detecting emotions in spoken dialogs, IEEE Transactions on Speech and Audio Processing, 200"/>
<remark content=""/>
<year content="2005"/>
<speaker-feature content="Emotion"/>
<results content="about 20% for binary decision (negative vs. non-negative)"/>
<features content="acoustic features: F0, duration, energy, formants words: emotional salience discourse: manually labeled speech acts used PCA to reduce feature set but quality wasn't improved"/>
<classifier content="compared GMMs with k-NN"/>
<testsample content="voice portal-utterance"/>
<languages content="English"/>
<resources content="NA"/>
<pre-gender-separation content="no"/>
<data-description content="voice portal flight-reservation application: 1200 dialogs with 7200 turns."/>
<telephone-data content="yes"/>
</study>
<study name="Lin and Sims">
<organization content="Hewlett Packard"/>
<reference content="Lin and Simske: Phoneme-less Hierarchical Accent Classification"/>
<remark content=""/>
<year content="2004"/>
<speaker-feature content="Accent"/>
<results content="about 18% for BE vs. US English"/>
<features content="MFCC (Sphinx)"/>
<classifier content="GMM"/>
<testsample content="over 30seconds utterances"/>
<languages content="English"/>
<resources content="NA"/>
<pre-gender-separation content="yes"/>
<data-description content="4 databases: BE train: WSJCAM0, BE test: IViE, US train: TIDIGITS, US test: Voicemail"/>
<telephone-data content="yes"/>
</study>
<study name="Liscombe">
<organization content="Columbia Univ. / AT&amp;T "/>
<reference content="Jackson Liscombe, Giuseppe Riccardi, Dilek Hakkani-Tür, Using Context to Improve Emotion Detection in Spoken Dialog Systems. In the Proceedings of Eurospeech05, 9th European Conference on Speech Communication and Technology, Lisbon, Portugal, September, 2005"/>
<remark content="Beneath prosodic, lexical and dialog act features they model the dialog history as a set of contextual features"/>
<year content="2005"/>
<speaker-feature content="Emotion"/>
<results content="20% for all features and binary classification (negative/non negative)."/>
<features content="Prosodic features energy, pitch, duration based on voice/unvoiced frames lexical features like words and interjections manually labeled. Semiautomatic extraction of phones and pauses. Manually labeled HMIHY Dialog Acts Context Features -greater than deviation from one to next turn."/>
<classifier content="BoosTexter: boosting algorithm for combining results of weak learner decisions."/>
<testsample content="voice portal-utterance"/>
<languages content="English"/>
<resources content="NA"/>
<pre-gender-separation content="no"/>
<data-description content="subset of HMIHY"/>
<telephone-data content="yes"/>
</study>
<study name="Meignier">
<organization content="Univ. Avignon"/>
<reference content="S. Meignier, D. Moraru, C. Fredouille, L. Besacier, J. Bonastre : Benefits of prior acoustic segmentation for automatic speaker segmentation, , ICASSP 2004, Montreal, Canada, Mai 2004"/>
<remark content="Gender detection is used for pre selection while speaker tracking"/>
<year content="2004"/>
<speaker-feature content="Gender"/>
<results content="between 1.5% and 5.5% for two DBs"/>
<features content="NA"/>
<classifier content="GMMs, HMMs"/>
<testsample content="longer utterances (Radio show)"/>
<languages content="English"/>
<resources content="NA"/>
<pre-gender-separation content="no"/>
<data-description content="NIST"/>
<telephone-data content="no"/>
</study>
<study name="Minematsu">
<organization content="Univ. Tokyo"/>
<reference content="N. Minematsu, M. Sekiguchi, and K. Hirose. « Automatic estimation of one’s age with his/her speech based upon acoustic modeling techniques of speakers.». In Proc. IEEE Int $(Bl  (Conference on Acoustic Signal and Speech Processing, pages 137-140,  2002"/>
<remark content="Age-Detection based on subjective listening impression."/>
<year content="2002"/>
<speaker-feature content="Age"/>
<results content="9% for binary decision (age-labeling done by subjective listening impression)"/>
<features content="F0 and Intensity"/>
<classifier content="GMM"/>
<testsample content="longer read text"/>
<languages content="Japanese"/>
<resources content="NA"/>
<pre-gender-separation content="no"/>
<data-description content="JNAS, S-JNAS"/>
<telephone-data content="no"/>
</study>
<study name="Müller">
<organization content="DFKI"/>
<reference content="Müller, Christian u. Wittig, Frank u. Baus, Jörg (2003): „Exploiting Speech for Recognizing Elderly Users to Respond to their Special Needs”, in Proceedings Eurospeech 2003"/>
<remark content=""/>
<year content="2003"/>
<speaker-feature content="Gender, Age"/>
<results content="19% for Gender (26% for women and 14% for men) 4% for age binary (over/under 60 years), but each group different Database!."/>
<features content="Jitter and Shimmer (8 correlated features)"/>
<classifier content="compared CARTs, ANN, kNN, NB, SVM. ANN performed best."/>
<testsample content="whole utterances"/>
<languages content="German"/>
<resources content="NA"/>
<pre-gender-separation content="yes, by combination with Bayes networks."/>
<data-description content="Scansoft corpus for speakers over 60, (10k utterances from 365 speakers), M3I corpus for speakers under 60, 5k utterances from 45 speakers"/>
<telephone-data content="no"/>
</study>
<study name="Parris">
<organization content="Ensigma Ltd."/>
<reference content="E.S. Parris, M.J. Carey: “Language Independent Gender Identification”, ICASSP 1996"/>
<remark content=""/>
<year content="1996"/>
<speaker-feature content="Gender"/>
<results content="5% for 11languages DB, 1% for English."/>
<features content="F0 and Spectrum"/>
<classifier content="HMM"/>
<testsample content="2 sec speech"/>
<languages content="English, Farsi, French, German, Japanese, Korean, Mandarin, Spanish, Tamil, Hindi and Vietnamese"/>
<resources content="NA"/>
<pre-gender-separation content="no"/>
<data-description content="OGI Multilingual Corpus"/>
<telephone-data content="no"/>
</study>
<study name="Petrushin">
<organization content="Accenture"/>
<reference content="V. Petrushin, “Emotion in Speech: Recognition and Application to Call Centers, Artificial. Neu. Net. In Engr. (ANNIE). 1999"/>
<remark content=""/>
<year content="1999"/>
<speaker-feature content="Emotion"/>
<results content="about 23% for binary (agitated vs. calm)"/>
<features content="pitch, formants, energy, duration"/>
<classifier content="ANNs"/>
<testsample content="longer messages (15-90 sec)"/>
<languages content="English"/>
<resources content="NA"/>
<pre-gender-separation content="no"/>
<data-description content="56 messages acted by 18 people"/>
<telephone-data content="yes"/>
</study>
<study name="Schötz">
<organization content="Uni Lundt"/>
<reference content="Schötz, S.: “Automatic prediction of speaker age using CART”. Term paper for course in Speech Recognition, CTT, KTH, Stockholm, 2004"/>
<remark content=""/>
<year content="2003"/>
<speaker-feature content="Gender, Age"/>
<results content="28% ER for binary age-classification, 9% ER for Gender-detection. 0.45 correlation for exact year."/>
<features content="F0 and Formants Mean and Range, Intensity, HNR"/>
<classifier content="CART"/>
<testsample content="short Swedish Wort (rasa)"/>
<languages content="Swedish"/>
<resources content="-"/>
<pre-gender-separation content="yes"/>
<data-description content="SWEDIA2000"/>
<telephone-data content="no"/>
</study>
<study name="Schultz">
<organization content="Univ. Karlsruhe, CMU"/>
<reference content="Schultz, T., Jin, Q., Laskowski, K., Tribble, A. and Waibel, A. :”Speaker, Accent, and Language Identification using Multilingual Phone Strings”.  Proc. of the HLT-2002, San Diego, 2002"/>
<remark content="general model, also used for Speaker-ID and ALI."/>
<year content="2002"/>
<speaker-feature content="Accent"/>
<results content="7% ER for English-speaking Japanese, 16% for two levels concerning ability"/>
<features content="phone strings: phone-models"/>
<classifier content="N-gram Models"/>
<testsample content="middle long sentences (from news text)"/>
<languages content="English/Japanese, but also Chinese, German, French, Croatian, Portuguese, Spanish and Turkish"/>
<resources content="NA"/>
<pre-gender-separation content="no"/>
<data-description content="GlobalPhone"/>
<telephone-data content="no"/>
</study>
<study name="Shafran">
<organization content="AT&amp;T "/>
<reference content="I. Shafran, M. Riley and M. Mohri,  Voice Signatures , Proc. of IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), US Virgin Islands, Nov 30-Dec 4, 2003"/>
<remark content="Age-Detection based on subjective listener impression."/>
<year content="2003"/>
<speaker-feature content="Age, Gender Emotion, Dialect"/>
<results content="5% for Gender and 30% for Age (subjective age, 3 groups), dialect (44%ER for 3 classes) and emotion (32%ER for 2 classes). "/>
<features content="MFCC and F0. also words"/>
<classifier content="HMM"/>
<testsample content="Voice portal Dialog"/>
<languages content="English"/>
<resources content="NA"/>
<pre-gender-separation content=""/>
<data-description content="HMIHY 0300"/>
<telephone-data content="yes"/>
</study>
<study name="Shafran">
<organization content="John Hopkins Univ."/>
<reference content="Izhak Shafran and Mehryar Mohri, A comparison of classifiers for detecting emotion from speech, Proc. of IEEE Inti Conference on Acoustic Signal and Speech Processing (ICASSP), Philadelphia, PA, Mar 19-23, 2005 "/>
<remark content=""/>
<year content="2005"/>
<speaker-feature content="Emotion"/>
<results content="20% for negative/non-negative"/>
<features content="spoken word lattices"/>
<classifier content="SVM"/>
<testsample content="Voice portal Dialog"/>
<languages content="English"/>
<resources content="NA"/>
<pre-gender-separation content=""/>
<data-description content="HMIHY 0300"/>
<telephone-data content="yes"/>
</study>
<study name="Teixiera">
<organization content="INESC Lissabon"/>
<reference content="Carlos Teixeira, Isabel Trancoso, and Antonio Serralheiro. Accent identification. In Proc. International Conference on Spoken Language Processing, Philadelphia. 1996"/>
<remark content=""/>
<year content="1996"/>
<speaker-feature content="Gender, accent"/>
<results content="5% for gender-detection (4% women, 8% men). accent: DA (36%), GER (36%), EN (27%), ES (44%), IT (38%), PT (29%)"/>
<features content="MFCC"/>
<classifier content="GMMs"/>
<testsample content="isolated words"/>
<languages content="For accent: target language English, Native: English, Danish, German, Spanish, Italian, Portuguese"/>
<resources content="NA"/>
<pre-gender-separation content="yes"/>
<data-description content="20 speaker per language, 200 English words"/>
<telephone-data content="no"/>
</study>
<study name="Walker">
<organization content="AT&amp;T "/>
<reference content="Walker, M. A., Langkilde-Geary, I., Wright Hastie, H., Wright, J., and Gorin, A. (2002). Automatically Training a Problematic Dialog Predictor for a Spoken Dialog System. Journal of Artificial Intelligence Research, 16: 293-319"/>
<remark content=""/>
<year content="2001"/>
<speaker-feature content="Emotion"/>
<results content="about 20% after first two utterances"/>
<features content="ASR-recognition, duration, number of words Task-description (1 out of 15) dialog manager (prompt, re prompt, confirm, ...) hand-labeled features (words, age, gender, ...)"/>
<classifier content=""/>
<testsample content="voice portal-utterance"/>
<languages content="English"/>
<resources content="NA"/>
<pre-gender-separation content="no"/>
<data-description content="subset of HMIHY"/>
<telephone-data content="yes"/>
</study>
<study name="Yacoub">
<organization content="HP"/>
<reference content="Yacoub, S. and Simske, S. and Lin, X. and Burns, J: Recognition of Emotions in Interactive Voice Response Systems, Eurospeech 2003"/>
<remark content="Test based on leave-one-speaker-out"/>
<year content="2003"/>
<speaker-feature content="Emotion"/>
<results content="binary (anger/neutral): about 10% for ANN (SVM better with little training), 20-30% for CARTs and KNNs"/>
<features content="39 features: pitch, energy duration: min, max mean, first derivative, jitter, shimmer, ratio of audible vs. inaudible parts. Using only the 19 best ones deteriorated the results by ~ 5 %"/>
<classifier content="Compared ANNs, SVMs, k-nearest neighbor and CARTs"/>
<testsample content="Short sentence (about 3 words)"/>
<languages content="English"/>
<resources content="NA"/>
<pre-gender-separation content="no"/>
<data-description content="2 k utterances of 8 actors acting 15 emotions uniformly distributed, 22kHz "/>
<telephone-data content="no"/>
</study>
<study name="Lugger">
<organization content="Univ. Stuttgart"/>
<reference content="Lugger, M and Yang, B. Classification of different speaking groups by means of voice quality parameters. Proc. ITG Fachtagung Sprachkommunikation 2006"/>
<remark content=""/>
<year content="2006"/>
<speaker-feature content="Gender, Emotion"/>
<results content="Gender: 7% male and 4% female, adding noise favors female recognition. Emotion: about 10-15% with four class-problem, happiness only 40%."/>
<features content="F0, formants and glottal-source estimates."/>
<classifier content="Linear Discriminant Analysis."/>
<testsample content="3 sec. Sentence"/>
<languages content="German"/>
<resources content="NA"/>
<pre-gender-separation content="no"/>
<data-description content="Berlin Emotional Speech Corpus: 10 actors performing 7 emotions with 10 sentences."/>
<telephone-data content="no"/>
</study>
<study name="Dellaert">
<organization content="CMU"/>
<reference content="Dellaert, F. Polzin, T. and Waibel, A.: Recognizing Emotions in Speech, ISCA Workshop on speech and Emotion, 2000"/>
<remark content=""/>
<year content="2000"/>
<speaker-feature content="Emotion"/>
<results content="up to 25% for 4 emotions."/>
<features content="Pitch: min, max, mean ,…"/>
<classifier content="KNN enhanced by feature selection strategies"/>
<testsample content="Short sentence "/>
<languages content="English"/>
<resources content="NA"/>
<pre-gender-separation content="no"/>
<data-description content="Part of Believable Agent Database: 5 speaker acting 4 emotions with 50 sentences"/>
<telephone-data content="no"/>
</study>
<study name="Singer">
<organization content="MIT"/>
<reference content="Singer,  Torres-Carrasquillo, Gleason, Campbell and Reynolds. Acoustic, Phonetic and Discriminative Approaches to Automatic Language Identification. Proc. Eurospeech 2003"/>
<remark content="fused phoneme based, GMM-based and SVM-based classifier with a 108 dimension Gaussian classifier combining the feature-scores of all three classifiers "/>
<year content="2003"/>
<speaker-feature content="Language"/>
<results content="Fusion performed best: from 3% EER for 30sec. Utterance up to 20% for 3 sec. Utterance. Best single classifier: GMMs."/>
<features content="Cepstral-based features"/>
<classifier content="compared phone-based, GMMs and SVMs"/>
<testsample content="NIST style: compared 3 sec, 10 sec 30 sec "/>
<languages content="Arabic, English, Farsi, French, German, Hindi, Japanese, Korean, Mandarin, Spanish, Tamil, Vietnamese"/>
<resources content="NA"/>
<pre-gender-separation content="no"/>
<data-description content="NIST data: 1280 telephone messages originating from CallFriend / callHome corpus"/>
<telephone-data content="yes"/>
</study>
<study name="Li">
<organization content="Institute of Infocomm Research, Georgia Univ."/>
<reference content="Li, Yaman, Lee, Ma, Tong, Zhu and Li. Language Recognition based on Score Distribution Feature Vectors and Discriminative Classifier Fusion. Proc. Odyssey 2006"/>
<remark content="fusion of PPRLM and Vector-based approaches."/>
<year content="2006"/>
<speaker-feature content="Language"/>
<results content="Fusion performed best up to 4% EER."/>
<features content="Cepstral-based features"/>
<classifier content="Compared ANN and LDF (linear discriminant function) with fusion of both"/>
<testsample content="NIST style: compared 3 sec, 10 sec 30 sec "/>
<languages content="Arabic, English, Farsi, French, German, Hindi, Japanese, Korean, Mandarin, Spanish, Tamil, Vietnamese"/>
<resources content="NA"/>
<pre-gender-separation content="no"/>
<data-description content="NIST data: 1280 telephone messages originating from CallFriend / callHome corpus"/>
<telephone-data content="yes"/>
</study>
<study name="Matejka">
<organization content="Brno Univ. of Technology"/>
<reference content="Matejka, P. and Burget, L. and Schwarz, P. and Cernocky, J.. Brno University of Technology System for NIST 2005 Language Recognition Evaluation"/>
<remark content="fusion of PRLM and GMMs"/>
<year content="2005"/>
<speaker-feature content="Language"/>
<results content="Again the fusion system performed best ranging from 3% EER for the 30sec. Stimuli and 14% EER for the 3 sec samples"/>
<features content="Cepstral-based features"/>
<classifier content="combination of phonotactic  (phoneme models) and acoustic classifier (GMMs)"/>
<testsample content="NIST style: compared 3 sec, 10 sec 30 sec "/>
<languages content="Arabic, English, Farsi, French, German, Hindi, Japanese, Korean, Mandarin, Spanish, Tamil, Vietnamese"/>
<resources content="NA"/>
<pre-gender-separation content="no"/>
<data-description content="NIST data: 1280 telephone messages originating from CallFriend / callHome corpus"/>
<telephone-data content="yes"/>
</study>
<study name="Campbell">
<organization content="MIT"/>
<reference content="W. M. Campbell, E. Singer, P. A. Torres-Carrasquillo, D. A. Reynolds  Language Recognition with Support Vector Machines Proc. IEEE Odyssey 2004"/>
<remark content="purely acoustic approach"/>
<year content="2004"/>
<speaker-feature content="Language"/>
<results content="Again the fusion system performed best, 3% EER for the 30sec. samples"/>
<features content="MFCCs and deltas"/>
<classifier content="compared SVMs, GMMs and fusion"/>
<testsample content="NIST style: compared 3 sec, 10 sec 30 sec "/>
<languages content="Arabic, English, Farsi, French, German, Hindi, Japanese, Korean, Mandarin, Spanish, Tamil, Vietnamese"/>
<resources content="NA"/>
<pre-gender-separation content="no"/>
<data-description content="NIST data: 1280 telephone messages originating from CallFriend / callHome corpus"/>
<telephone-data content="yes"/>
</study>
<study name="Blouin">
<organization content="France Telecom"/>
<reference content="Blouin, C. / Maffiolo, V. (2005): A study on the automatic detection and characterization of emotion in a voice service context, In Interspeech-2005, 469-472."/>
<remark content="Classified separately for different word occurrences / utterance lengths. Discuss evolution of emotion in dialog."/>
<year content="2005"/>
<speaker-feature content="Emotion"/>
<results content="about 35% for binary classification (negative - non-negative)"/>
<features content="turn wise: silence/unvoiced/voiced durations, HF/LF energy ration based on MFCCs, F0-contour, Energy contour including micro-contour"/>
<classifier content="LDA classifier"/>
<testsample content="Short words like yes, no or town names."/>
<languages content="french"/>
<resources content="NA"/>
<pre-gender-separation content="yes"/>
<data-description content="Directory inquiring voice service. 1666 dialogs, 8.6 turns per dialog, 40%male 60% female, 35% of dialogs contain negative emotions, 10% of turns labeled as negative"/>
<telephone-data content="yes"/>
</study>
<study name="Sato">
<organization content="Hitachi"/>
<reference content="Nobuo Sato and Yasunari Obuchi: Emotion Recognition using Mel-Frequency Cepstral Coefficients, Information and Media Technologies, Vol. 2 (2007) , No. 3 pp.835-848"/>
<remark content=""/>
<year content="2007"/>
<speaker-feature content="Emotion"/>
<results content="about 34% for three emotions (anger, happy, sad) and neutral"/>
<features content="16 MFCCs"/>
<classifier content="frame-wise k-nearest neighbor"/>
<testsample content="3-4 words, numbers or dates"/>
<languages content="English"/>
<resources content="NA"/>
<pre-gender-separation content="no"/>
<data-description content="LDC emotional speech database (Liberman et al 2002): 7 actors performing 3 emotions and neutral"/>
<telephone-data content="no"/>
</study>
<study name="Neiberg">
<organization content="KTH"/>
<reference content="D. Neiberg, K. Elenius, and K. Laskowski, Emotion recognition in spontaneous speech using GMMs, ICSP 2006"/>
<remark content="Used MFCC-low (20-300 Hz) to model pitch."/>
<year content="2006"/>
<speaker-feature content="Emotion"/>
<results content="Voice Provider: about 5% for binary (neutral/emphatic vs. negative), Meeting Corpus: about 20% for ternary classification (neutral, positive, negative)"/>
<features content="spectral: 39 dimensional MFCC vector containing delta and delta-delta. Pitch: MFCC-low: MFCCs with filter banks 20-300Hz. Also AMDF pitch tracking. Words: 3-grams"/>
<classifier content="GMMs on frame level"/>
<testsample content="Compared voice control commands with discussion-utterances."/>
<languages content="Swedish, English"/>
<resources content="NA"/>
<pre-gender-separation content="no"/>
<data-description content="Compared two databases: 1. Voice Provider material: ~7500 utterances human-machine interaction. 2. ISL Meeting Corpus: about 1200 utterances from meetings"/>
<telephone-data content="yes"/>
</study>
</studies>
