The MIT License (MIT)
Copyright (c) 2014 CNRS
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
AUTHORS
Hervé Bredin -- http://herve.niderb.fr
# setting up IPython Notebook
# for later pretty figures...
%pylab inline
pylab.rcParams['figure.figsize'] = (20.0, 5.0)
from pyannote.core import Annotation, Segment
from pyannote.core.notebook import set_notebook_crop
set_notebook_crop(Segment(0, 20))
from tvd import TheBigBangTheory
dataset = TheBigBangTheory('/Volumes/data/tvd/')
sixEpisodes = dataset.episodes[:6]
firstEpisode = dataset.episodes[0]
Experiments are done on the first six episodes of The Big Bang Theory using leave-one-out cross-validation.
def leaveOneOutCrossValidation():
"""(fiveEpisodes, oneEpisode) iterator"""
for e, episode in enumerate(sixEpisodes):
yield sixEpisodes[:e] + sixEpisodes[e+1:], episode
for train, test in leaveOneOutCrossValidation():
    print 'Train on episodes {', ' '.join([str(t.episode) for t in train]), '} / Test on episode', test.episode
Precise speech/non-speech references are obtained from the TVD subset of The Big Bang Theory.
SPEECH, NON_SPEECH = 'speech', 'non_speech'
reference = {}
for episode in sixEpisodes:
    manual_annotation = dataset.get_resource('speaker', episode)
    translation = {label: SPEECH if label[:7] == 'speech_' else NON_SPEECH for label in manual_annotation.labels()}
    reference[episode] = manual_annotation.translate(translation).smooth()
reference[firstEpisode]
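For illustration, here is what this label mapping does on a toy annotation (the segments and labels below are made up for the example; only the translate call mirrors the code above).
toy = Annotation(uri='toy')
toy[Segment(0, 3)] = 'speech_sheldon'  # hypothetical speaker label
toy[Segment(3, 5)] = 'laughter'        # hypothetical non-speech label
toy.translate({'speech_sheldon': SPEECH, 'laughter': NON_SPEECH})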
Subtitle timespans can be used as a coarse annotation for training speech activity detection.
from pyannote.parser.srt import SRTParser
parser = SRTParser(split=True,     # multi-speaker subtitles are split into one subtitle per speaker
                   duration=True)  # subtitle duration is interpolated from its text length
subtitles = {}
for episode in sixEpisodes:
    subtitles[episode] = parser.read(dataset.path_to_subtitles(episode))
subtitles[firstEpisode]
# Build "weak" speech/non-speech reference based on subtitle timespans
weak_reference = {}
for episode in sixEpisodes:
    # start by adding 'speech' segments
    annotation = Annotation(uri=episode)
    for start_time, end_time, edge_data in subtitles[episode].ordered_edges_iter(data=True):
        if 'subtitle' in edge_data:
            annotation[Segment(start_time, end_time)] = SPEECH
    # then fill in the gaps with 'non_speech' segments
    extent = Segment(0, dataset.get_episode_duration(episode))
    for gap in annotation.get_timeline().gaps(extent):
        annotation[gap] = NON_SPEECH
    weak_reference[episode] = annotation
weak_reference[firstEpisode]
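To make the gap-filling step concrete, here is the same mechanism on a toy 10-second annotation (the times are invented; the gaps call is the one used above).
toy = Annotation(uri='toy')
toy[Segment(1, 4)] = SPEECH
toy[Segment(6, 9)] = SPEECH
for gap in toy.get_timeline().gaps(Segment(0, 10)):
    toy[gap] = NON_SPEECH
toy  # [0, 1], [4, 6] and [9, 10] are now labeled NON_SPEECH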
from pyannote.features.audio.yaafe import YaafeMFCC
yaafeMFCC = YaafeMFCC(e=False, De=True, DDe=False,  # no static energy, only its first derivative
                      coefs=12, D=True, DD=False)   # 12 MFCC coefficients + their first derivatives
mfcc = {}
for episode in sixEpisodes:
    mfcc[episode] = yaafeMFCC.extract(dataset.path_to_audio(episode))
SPEECH_MINDURATION = 0.250     # minimum duration of a 'speech' segment, in seconds
NONSPEECH_MINDURATION = 0.250  # minimum duration of a 'non_speech' segment, in seconds
NCOMPONENTS = 16               # 16 Gaussians per HMM state
from pyannote.algorithms.classification.hmm import ViterbiHMM
# the results will be stored in this dictionary, one entry per episode
hypothesis_full = {}
for trainOnFiveEpisodes, episode in leaveOneOutCrossValidation():
    # this is the episode we are testing on;
    # the five other episodes are used for training
    print episode
    # HMM structure:
    # - 16 Gaussians per state
    # - diagonal covariance matrices
    # - 250 ms minimum duration in each state
    hmm = ViterbiHMM(targets=[SPEECH, NON_SPEECH],
                     min_duration={SPEECH: SPEECH_MINDURATION, NON_SPEECH: NONSPEECH_MINDURATION},
                     sampling=500,
                     n_components=NCOMPONENTS,
                     covariance_type='diag')
    # fully supervised training on the (precise) reference of the five other episodes
    hmm.fit([reference[e] for e in trainOnFiveEpisodes],
            [mfcc[e] for e in trainOnFiveEpisodes])
    # testing (Viterbi decoding)
    hypothesis_full[episode] = hmm.apply(mfcc[episode])
hypothesis_full[firstEpisode]
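As a quick sanity check, this fully supervised hypothesis can already be scored on the first episode, using the same metric and 50 ms collar as in the evaluation section below.
from pyannote.metrics.identification import IdentificationErrorRate
ier = IdentificationErrorRate(collar=0.050)
print ier(reference[firstEpisode], hypothesis_full[firstEpisode])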
As a bonus (not in the paper), we add a two-pass weakly supervised speech activity detection: the first pass trains the HMM on the weak (subtitle-based) reference, and each subsequent pass retrains it on the previous pass's output for the training episodes.
# the results of each pass will be stored in these dictionaries, one entry per episode
nPass = 2
hypothesis_weak = [dict() for p in range(nPass)]
for trainOnFiveEpisodes, episode in leaveOneOutCrossValidation():
    print episode,
    # hypothesis_pass always contains the output of the previous pass
    # (initialized with the weak, subtitle-based reference)
    hypothesis_pass = {e: weak_reference[e] for e in trainOnFiveEpisodes}
    for p in range(nPass):
        print p+1,
        hmm = ViterbiHMM(targets=[SPEECH, NON_SPEECH],
                         min_duration={SPEECH: SPEECH_MINDURATION, NON_SPEECH: NONSPEECH_MINDURATION},
                         sampling=500,
                         n_components=NCOMPONENTS,
                         covariance_type='diag')
        # weakly supervised training on the output of the previous pass
        hmm.fit([hypothesis_pass[e] for e in trainOnFiveEpisodes],
                [mfcc[e] for e in trainOnFiveEpisodes])
        # decode the test episode with this pass's model...
        hypothesis_weak[p][episode] = hmm.apply(mfcc[episode])
        # ... and the training episodes, to be used by the next pass
        hypothesis_pass = {e: hmm.apply(mfcc[e]) for e in trainOnFiveEpisodes}
    print
hypothesis_full[firstEpisode]
hypothesis_weak[0][firstEpisode]
from pyannote.metrics.identification import IdentificationErrorRate
# identification error rates, with a 50 ms collar around reference segment boundaries
ier_full = IdentificationErrorRate(collar=0.050)
ier_weak = [IdentificationErrorRate(collar=0.050) for p in range(nPass)]
ier_subs = IdentificationErrorRate(collar=0.050)
line = 'EPISODE | SUBT. | FULLY'
for p in range(nPass):
    line += ' | WEAK{p:d}'.format(p=p+1)
print line
line = '----------------------------------------------------'
for p in range(nPass):
    line += '--------'
print line
for episode in sixEpisodes:
    subs = ier_subs(reference[episode], weak_reference[episode])
    full = ier_full(reference[episode], hypothesis_full[episode])
    weak = [ier_weak[p](reference[episode], hypothesis_weak[p][episode]) for p in range(nPass)]
    line = '{episode:s} | {subs:4.1f}% | {full:4.1f}%'.format(episode=episode,
                                                              full=100*full,
                                                              subs=100*subs)
    for p in range(nPass):
        line += ' | {weak:4.1f}%'.format(weak=100*weak[p])
    print line
line = '----------------------------------------------------'
for p in range(nPass):
    line += '--------'
print line
# abs(metric) returns the error rate accumulated over all processed episodes
line = 'TOTAL | {subs:4.1f}% | {full:4.1f}%'.format(subs=100*abs(ier_subs),
                                                    full=100*abs(ier_full))
for p in range(nPass):
    line += ' | {weak:4.1f}%'.format(weak=100*abs(ier_weak[p]))
print line