The MIT License (MIT)
Copyright (c) 2014 CNRS
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
AUTHORS
Hervé Bredin -- http://herve.niderb.fr
# setting up IPython Notebook
# for later pretty figures...
%pylab inline
pylab.rcParams['figure.figsize'] = (20.0, 5.0)
from pyannote.core import Annotation, Segment
from pyannote.core.notebook import set_notebook_crop
set_notebook_crop(Segment(0, 20))
from tvd import TheBigBangTheory
dataset = TheBigBangTheory('/Volumes/data/tvd/')
sixEpisodes = dataset.episodes[:6]
firstEpisode = dataset.episodes[0]
Experiments are done on the first six episodes of The Big Bang Theory using leave-one-out cross-validation.
def leaveOneOutCrossValidation():
"""(fiveEpisodes, oneEpisode) iterator"""
for e, episode in enumerate(sixEpisodes):
yield sixEpisodes[:e] + sixEpisodes[e+1:], episode
for train, test in leaveOneOutCrossValidation():
    print 'Train on episodes {', ' '.join([str(t.episode) for t in train]), '} / Test on episode', test.episode
Precise speech/non-speech references are obtained from the TVD subset of The Big Bang Theory.
SPEECH, NON_SPEECH = 'speech', 'non_speech'
reference = {}
for episode in sixEpisodes:
    manual_annotation = dataset.get_resource('speaker', episode)
    translation = {label: SPEECH if label[:7] == 'speech_' else NON_SPEECH for label in manual_annotation.labels()}
    reference[episode] = manual_annotation.translate(translation).smooth()
reference[firstEpisode]
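For illustration, here is what this label mapping does on a toy annotation (the segments and labels below are made up for the example; only the translate call mirrors the code above).
toy = Annotation(uri='toy')
toy[Segment(0, 3)] = 'speech_sheldon'  # hypothetical speaker label
toy[Segment(3, 5)] = 'laughter'        # hypothetical non-speech label
toy.translate({'speech_sheldon': SPEECH, 'laughter': NON_SPEECH})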
Subtitle timespans can be used as a coarse annotation for training speech activity detection.
from pyannote.parser.srt import SRTParser
parser = SRTParser(split=True,     # multi-speaker subtitles are split into one subtitle per speaker
                   duration=True)  # subtitle duration is interpolated from its text length
subtitles = {}
for episode in sixEpisodes:
    subtitles[episode] = parser.read(dataset.path_to_subtitles(episode))
subtitles[firstEpisode]
# Build "weak" speech/non-speech reference based on subtitle timespans
weak_reference = {}
for episode in sixEpisodes:
    # start by adding 'speech' segments
    annotation = Annotation(uri=episode)
    for start_time, end_time, edge_data in subtitles[episode].ordered_edges_iter(data=True):
        if 'subtitle' in edge_data:
            annotation[Segment(start_time, end_time)] = SPEECH
    # then fill in the gaps with 'non_speech' segments
    extent = Segment(0, dataset.get_episode_duration(episode))
    for gap in annotation.get_timeline().gaps(extent):
        annotation[gap] = NON_SPEECH
    weak_reference[episode] = annotation
weak_reference[firstEpisode]
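To make the gap-filling step concrete, here is the same mechanism on a toy 10-second annotation (the times are invented; the gaps call is the one used above).
toy = Annotation(uri='toy')
toy[Segment(1, 4)] = SPEECH
toy[Segment(6, 9)] = SPEECH
for gap in toy.get_timeline().gaps(Segment(0, 10)):
    toy[gap] = NON_SPEECH
toy  # [0, 1], [4, 6] and [9, 10] are now labeled NON_SPEECH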
from pyannote.features.audio.yaafe import YaafeMFCC
yaafeMFCC = YaafeMFCC(e=False, De=True, DDe=False,  # no static energy, only its first derivative
                      coefs=12, D=True, DD=False)   # 12 MFCC coefficients + their first derivatives
mfcc = {}
for episode in sixEpisodes:
    mfcc[episode] = yaafeMFCC.extract(dataset.path_to_audio(episode))
SPEECH_MINDURATION = 0.250     # minimum duration of a 'speech' segment, in seconds
NONSPEECH_MINDURATION = 0.250  # minimum duration of a 'non_speech' segment, in seconds
NCOMPONENTS = 16               # 16 Gaussians per HMM state
from pyannote.algorithms.classification.hmm import ViterbiHMM
# the results will be stored in this dictionary, one entry per episode
hypothesis_full = {}
for trainOnFiveEpisodes, episode in leaveOneOutCrossValidation():
    # this is the episode we are testing on;
    # the five other episodes are used for training
    print episode
    # HMM structure:
    # - 16 Gaussians per state
    # - diagonal covariance matrices
    # - 250 ms minimum duration in each state
    hmm = ViterbiHMM(targets=[SPEECH, NON_SPEECH],
                     min_duration={SPEECH: SPEECH_MINDURATION, NON_SPEECH: NONSPEECH_MINDURATION},
                     sampling=500,
                     n_components=NCOMPONENTS,
                     covariance_type='diag')
    # fully supervised training on the (precise) reference of the five other episodes
    hmm.fit([reference[e] for e in trainOnFiveEpisodes],
            [mfcc[e] for e in trainOnFiveEpisodes])
    # testing (Viterbi decoding)
    hypothesis_full[episode] = hmm.apply(mfcc[episode])
hypothesis_full[firstEpisode]
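As a quick sanity check, this fully supervised hypothesis can already be scored on the first episode, using the same metric and 50 ms collar as in the evaluation section below.
from pyannote.metrics.identification import IdentificationErrorRate
ier = IdentificationErrorRate(collar=0.050)
print ier(reference[firstEpisode], hypothesis_full[firstEpisode])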
As a bonus (not in the paper), we add a two-pass weakly supervised speech activity detection: the first pass trains the HMM on the weak (subtitle-based) reference, and each subsequent pass retrains it on the previous pass's output for the training episodes.
# the results of each pass will be stored in these dictionaries, one entry per episode
nPass = 2
hypothesis_weak = [dict() for p in range(nPass)]
for trainOnFiveEpisodes, episode in leaveOneOutCrossValidation():
    print episode,
    # hypothesis_pass always contains the output of the previous pass
    # (initialized with the weak, subtitle-based reference)
    hypothesis_pass = {e: weak_reference[e] for e in trainOnFiveEpisodes}
    for p in range(nPass):
        print p+1,
        hmm = ViterbiHMM(targets=[SPEECH, NON_SPEECH],
                         min_duration={SPEECH: SPEECH_MINDURATION, NON_SPEECH: NONSPEECH_MINDURATION},
                         sampling=500,
                         n_components=NCOMPONENTS,
                         covariance_type='diag')
        # weakly supervised training on the output of the previous pass
        hmm.fit([hypothesis_pass[e] for e in trainOnFiveEpisodes],
                [mfcc[e] for e in trainOnFiveEpisodes])
        # decode the test episode with this pass's model...
        hypothesis_weak[p][episode] = hmm.apply(mfcc[episode])
        # ... and the training episodes, to be used by the next pass
        hypothesis_pass = {e: hmm.apply(mfcc[e]) for e in trainOnFiveEpisodes}
    print
hypothesis_full[firstEpisode]
hypothesis_weak[0][firstEpisode]
from pyannote.metrics.identification import IdentificationErrorRate
# identification error rates, with a 50 ms collar around reference segment boundaries
ier_full = IdentificationErrorRate(collar=0.050)
ier_weak = [IdentificationErrorRate(collar=0.050) for p in range(nPass)]
ier_subs = IdentificationErrorRate(collar=0.050)
line = 'EPISODE | SUBT. | FULLY'
for p in range(nPass):
    line += ' | WEAK{p:d}'.format(p=p+1)
print line
line = '----------------------------------------------------'
for p in range(nPass):
    line += '--------'
print line
for episode in sixEpisodes:
    subs = ier_subs(reference[episode], weak_reference[episode])
    full = ier_full(reference[episode], hypothesis_full[episode])
    weak = [ier_weak[p](reference[episode], hypothesis_weak[p][episode]) for p in range(nPass)]
    line = '{episode:s} | {subs:4.1f}% | {full:4.1f}%'.format(episode=episode,
                                                              full=100*full,
                                                              subs=100*subs)
    for p in range(nPass):
        line += ' | {weak:4.1f}%'.format(weak=100*weak[p])
    print line
line = '----------------------------------------------------'
for p in range(nPass):
    line += '--------'
print line
# abs(metric) returns the error rate accumulated over all processed episodes
line = 'TOTAL | {subs:4.1f}% | {full:4.1f}%'.format(subs=100*abs(ier_subs),
                                                    full=100*abs(ier_full))
for p in range(nPass):
    line += ' | {weak:4.1f}%'.format(weak=100*abs(ier_weak[p]))
print line