The MIT License (MIT)
Copyright (c) 2014 CNRS
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
AUTHORS
Hervé Bredin -- http://herve.niderb.fr
# setting up IPython Notebook
# for later pretty figures...
%pylab inline
pylab.rcParams['figure.figsize'] = (20.0, 5.0)
from pyannote.core import Annotation, Segment
from pyannote.core.notebook import set_notebook_crop
set_notebook_crop(Segment(0, 20))
from tvd import TheBigBangTheory
dataset = TheBigBangTheory('/Volumes/data/tvd/')
sixEpisodes = dataset.episodes[:6]
firstEpisode = dataset.episodes[0]
targets = ['non_speech', 'SHELDON', 'LEONARD', 'RAJ', 'HOWARD', 'PENNY', 'OTHER']
Experiments are done on the first six episodes of The Big Bang Theory using leave-one-out cross-validation.
def leaveOneOutCrossValidation():
    """(fiveEpisodes, oneEpisode) iterator"""
    for e, episode in enumerate(sixEpisodes):
        yield sixEpisodes[:e] + sixEpisodes[e+1:], episode
for train, test in leaveOneOutCrossValidation():
    print 'Train on episodes {', ' '.join([str(t.episode) for t in train]), '} / Test on episode', test.episode
Precise speaker references are obtained from the TVD The Big Bang Theory subset.
SPEECH, NON_SPEECH = 'speech', 'non_speech'
reference = {}
for episode in sixEpisodes:
    manual_annotation = dataset.get_resource('speaker', episode)
    translation = {'music_titlesong': 'non_speech',
                   'silence': 'non_speech',
                   'sound_laugh': 'non_speech',
                   'sound_laughclap': 'non_speech',
                   'sound_other': 'non_speech',
                   'speech_howard': 'HOWARD',
                   'speech_leonard': 'LEONARD',
                   'speech_other': 'OTHER',
                   'speech_overlapping': 'OTHER',
                   'speech_penny': 'PENNY',
                   'speech_raj': 'RAJ',
                   'speech_sheldon': 'SHELDON'}
    reference[episode] = manual_annotation.translate(translation).smooth()
reference[firstEpisode]
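As a quick sanity check (not in the original notebook), we can look at how much each label speaks in the first episode; this assumes the chart() method of pyannote.core Annotation, which returns (label, duration) pairs sorted by decreasing total duration.
# sanity check (assumption: Annotation.chart() returns (label, duration)
# pairs sorted by decreasing total duration)
for label, duration in reference[firstEpisode].chart():
    print '{0:12s} {1:6.1f}s'.format(label, duration)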
Subtitle timespans aligned with the transcripts can be used as a coarse annotation for training speaker identification.
from pyannote.parser.srt import SRTParser
parser = SRTParser(split=True,     # multi-speaker subtitles are split
                   duration=True)  # their duration is interpolated based on their text length
subtitles = {}
transcripts = {}
for episode in sixEpisodes:
    subtitles[episode] = parser.read(dataset.path_to_subtitles(episode))
    transcripts[episode] = dataset.get_resource('transcript', episode)
subtitles[firstEpisode]
transcripts[firstEpisode]
from pyannote.features.text.preprocessing import TextPreProcessing
from pyannote.features.text.tfidf import TFIDF
from pyannote.algorithms.alignment.transcription import TFIDFAlignment
# this is the default text (i.e. subtitles or speech transcript) pre-processing
preprocessing = TextPreProcessing(
    tokenize=True,   # step 1: tokenize sentences into words
    lemmatize=True,  # step 2: lemmatization
    stem=True,       # step 3: stemming
    stopwords=True,  # step 4: remove stop-words
    pos_tag=True,    # step 5: part-of-speech tagging...
    keep_pos=True,   #         ...keeping only nouns, adjectives, verbs and adverbs
    min_length=2)    # step 6: remove stems shorter than 2 letters
# this is the TF-IDF transformer that will project
# each pre-processed text into a fixed-size vector
tfidf = TFIDF(preprocessing=preprocessing,  # use just-defined pre-processing
              binary=True)                  # use binary term-frequency
aligner = TFIDFAlignment(tfidf, adapt=True)
merged = aligner(transcripts[firstEpisode], subtitles[firstEpisode], vattribute='speech', hattribute='subtitle')
merged
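For intuition, here is a toy illustration of the binary TF-IDF similarity the alignment relies on; it uses scikit-learn as a stand-in for the pyannote implementation, and the sentences are made up.
# toy example (scikit-learn stand-in, not the actual pyannote implementation):
# sentences sharing content words get a high cosine similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
sentences = ['So, what do you think of our new neighbor?',
             'What do you guys think about the new neighbor?',
             'I will pick up my dry cleaning tomorrow.']
vectorizer = TfidfVectorizer(binary=True)  # binary term-frequency, as above
vectors = vectorizer.fit_transform(sentences)
print cosine_similarity(vectors[0], vectors[1:])  # first sentence vs. the two others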
# Build "weak" speaker identification reference based on subtitle timespans aligned with transcripts
weak_reference = {}
for episode in sixEpisodes:
    merged = aligner(transcripts[episode], subtitles[episode], vattribute='speech', hattribute='subtitle')
    annotation = Annotation(uri=episode)
    for start_time, end_time, data in merged.edges_iter(data=True):
        if 'speaker' in data:
            if data['speaker'] not in ['SHELDON', 'RAJ', 'PENNY', 'LEONARD', 'HOWARD']:
                label = 'OTHER'
            else:
                label = data['speaker']
            if start_time.anchored and end_time.anchored:
                annotation[Segment(start_time, end_time)] = label
    # then fill in the gaps with 'non_speech' segments
    extent = Segment(0, dataset.get_episode_duration(episode))
    for gap in annotation.get_timeline().gaps(extent):
        annotation[gap] = 'non_speech'
    weak_reference[episode] = annotation
weak_reference[firstEpisode]
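Before training on it, it is worth comparing the weak reference to the precise one, e.g. in terms of total non-speech duration (a minimal sketch, assuming Annotation.label_duration() from pyannote.core):
# rough sanity check (assumption: label_duration() returns the total
# duration of all segments carrying the given label)
print 'precise non_speech: {0:6.1f}s'.format(reference[firstEpisode].label_duration('non_speech'))
print 'weak    non_speech: {0:6.1f}s'.format(weak_reference[firstEpisode].label_duration('non_speech'))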
from pyannote.features.audio.yaafe import YaafeMFCC
yaafeMFCC = YaafeMFCC(e=False, De=True, DDe=True,  # no static energy, but its first and second derivatives
                      coefs=13, D=True, DD=True)   # 13 MFCC coefficients + their first and second derivatives
mfcc = {}
for episode in sixEpisodes:
    mfcc[episode] = yaafeMFCC.extract(dataset.path_to_audio(episode))
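A quick look at the extracted features (a sketch; it assumes extract() returns a pyannote SlidingWindowFeature whose data attribute is a (n_frames, n_dimensions) numpy array): with 13 static coefficients, their first and second derivatives, plus the first and second derivatives of the energy, each frame should be 41-dimensional.
# sanity check (assumption: .data is a (n_frames, n_dimensions) numpy array)
features = mfcc[firstEpisode]
print features.data.shape  # expected: 13 + 13 + 13 + 1 + 1 = 41 dimensions per frame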
NCOMPONENTS = 64
from pyannote.algorithms.classification.hmm import ViterbiHMM
# we will store one result per episode in this dictionary
hypothesis_full = {}
for trainOnFiveEpisodes, episode in leaveOneOutCrossValidation():
    # this is the episode we are testing;
    # the five other episodes are used for training
    print episode
    # HMM structure:
    # - one state per target, modeled by a 64-component GMM
    # - diagonal covariance matrices
    # - 250ms minimum duration in each speaker state
    hmm = ViterbiHMM(targets=targets,
                     min_duration={t: 0.250 for t in ['SHELDON', 'LEONARD', 'RAJ', 'HOWARD', 'PENNY', 'OTHER']},
                     sampling=500,
                     n_components=NCOMPONENTS,
                     covariance_type='diag')
    # fully supervised (reference) training on five episodes
    hmm.fit([reference[e] for e in trainOnFiveEpisodes],
            [mfcc[e] for e in trainOnFiveEpisodes])
    # testing (Viterbi decoding)
    hypothesis_full[episode] = hmm.apply(mfcc[episode])
hypothesis_full[firstEpisode]
As a bonus (not in the paper), we add a two-pass weakly-supervised speaker identification experiment: each pass is trained on the labels produced by the previous one, starting from the subtitle-based weak reference.
# we will store one result per episode and per pass in this list of dictionaries
nPass = 2
hypothesis_weak = [dict() for p in range(nPass)]
for trainOnFiveEpisodes, episode in leaveOneOutCrossValidation():
    print episode,
    # always contains the output of the previous pass
    # (initialized with the subtitle-based weak reference)
    hypothesis_pass = {e: weak_reference[e] for e in trainOnFiveEpisodes}
    for p in range(nPass):
        print p+1,
        hmm = ViterbiHMM(targets=targets,
                         min_duration={t: 0.250 for t in ['SHELDON', 'LEONARD', 'RAJ', 'HOWARD', 'PENNY', 'OTHER']},
                         sampling=500,
                         n_components=NCOMPONENTS,
                         covariance_type='diag')
        hmm.fit([hypothesis_pass[e] for e in trainOnFiveEpisodes],
                [mfcc[e] for e in trainOnFiveEpisodes])
        hypothesis_weak[p][episode] = hmm.apply(mfcc[episode])
        # the output on the five training episodes becomes
        # the training labels of the next pass
        hypothesis_pass = {e: hmm.apply(mfcc[e]) for e in trainOnFiveEpisodes}
    print
hypothesis_full[firstEpisode]
hypothesis_weak[0][firstEpisode]
from pyannote.metrics.identification import IdentificationErrorRate
# identification error rate (IER) accumulators
ier_full = IdentificationErrorRate()                          # fully supervised HMM
ier_weak = [IdentificationErrorRate() for p in range(nPass)]  # weakly supervised HMM, one per pass
ier_subs = IdentificationErrorRate()                          # subtitle-based weak reference itself
line = 'EPISODE | SUBT. | FULLY'
for p in range(nPass):
    line += ' | WEAK{p:d}'.format(p=p+1)
print line
line = '----------------------------------------------------'
for p in range(nPass):
    line += '--------'
print line
for episode in sixEpisodes:
    subs = ier_subs(reference[episode], weak_reference[episode])
    full = ier_full(reference[episode], hypothesis_full[episode])
    weak = [ier_weak[p](reference[episode], hypothesis_weak[p][episode]) for p in range(nPass)]
    line = '{episode:s} | {subs:4.1f}% | {full:4.1f}%'.format(episode=episode,
                                                              full=100*full,
                                                              subs=100*subs)
    for p in range(nPass):
        line += ' | {weak:4.1f}%'.format(weak=100*weak[p])
    print line
line = '----------------------------------------------------'
for p in range(nPass):
    line += '--------'
print line
line = 'TOTAL | {subs:4.1f}% | {full:4.1f}%'.format(subs=100*abs(ier_subs),
                                                    full=100*abs(ier_full))
for p in range(nPass):
    line += ' | {weak:4.1f}%'.format(weak=100*abs(ier_weak[p]))
print line