The MIT License (MIT)
Copyright (c) 2014 CNRS
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
AUTHORS
Hervé Bredin -- http://herve.niderb.fr
# setting up IPython Notebook
# for later pretty figures...
%pylab inline
pylab.rcParams['figure.figsize'] = (20.0, 5.0)
from pyannote.core import Annotation, Segment
from pyannote.core.notebook import set_notebook_crop
set_notebook_crop(Segment(0, 20))
from tvd import TheBigBangTheory
dataset = TheBigBangTheory('/Volumes/data/tvd/')
sixEpisodes = dataset.episodes[:6]
firstEpisode = dataset.episodes[0]
targets = ['non_speech', 'SHELDON', 'LEONARD', 'RAJ', 'HOWARD', 'PENNY', 'OTHER']
Experiments are done on the first six episodes of The Big Bang Theory using leave-one-out cross-validation.
def leaveOneOutCrossValidation():
    """(fiveEpisodes, oneEpisode) iterator"""
    for e, episode in enumerate(sixEpisodes):
        yield sixEpisodes[:e] + sixEpisodes[e+1:], episode
for train, test in leaveOneOutCrossValidation():
    print 'Train on episodes {', ' '.join([str(t.episode) for t in train]), '} / Test on episode', test.episode
Precise speaker references are obtained from the TVD The Big Bang Theory subset.
SPEECH, NON_SPEECH = 'speech', 'non_speech'
reference = {}
for episode in sixEpisodes:
    manual_annotation = dataset.get_resource('speaker', episode)
    translation = {'music_titlesong': 'non_speech',
                   'silence': 'non_speech',
                   'sound_laugh': 'non_speech',
                   'sound_laughclap': 'non_speech',
                   'sound_other': 'non_speech',
                   'speech_howard': 'HOWARD',
                   'speech_leonard': 'LEONARD',
                   'speech_other': 'OTHER',
                   'speech_overlapping': 'OTHER',
                   'speech_penny': 'PENNY',
                   'speech_raj': 'RAJ',
                   'speech_sheldon': 'SHELDON'}
    reference[episode] = manual_annotation.translate(translation).smooth()
reference[firstEpisode]
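As a quick sanity check (not in the original notebook), we can look at how much each label speaks in the first episode; this assumes the chart() method of pyannote.core Annotation, which returns (label, duration) pairs sorted by decreasing total duration.
# sanity check (assumption: Annotation.chart() returns (label, duration)
# pairs sorted by decreasing total duration)
for label, duration in reference[firstEpisode].chart():
    print '{0:12s} {1:6.1f}s'.format(label, duration)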
Subtitle timespans aligned with the transcripts can be used as a coarse annotation for training speaker identification.
from pyannote.parser.srt import SRTParser
parser = SRTParser(split=True,     # multi-speaker subtitles are split
                   duration=True)  # their duration is interpolated based on their text length
subtitles = {}
transcripts = {}
for episode in sixEpisodes:
    subtitles[episode] = parser.read(dataset.path_to_subtitles(episode))
    transcripts[episode] = dataset.get_resource('transcript', episode)
subtitles[firstEpisode]
transcripts[firstEpisode]
from pyannote.features.text.preprocessing import TextPreProcessing
from pyannote.features.text.tfidf import TFIDF
from pyannote.algorithms.alignment.transcription import TFIDFAlignment
# this is the default text (i.e. subtitles or speech transcript) pre-processing
preprocessing = TextPreProcessing(
    tokenize=True,   # step 1: tokenize sentences into words
    lemmatize=True,  # step 2: lemmatization
    stem=True,       # step 3: stemming
    stopwords=True,  # step 4: remove stop-words
    pos_tag=True,    # step 5: part-of-speech tagging...
    keep_pos=True,   #         ...keeping only nouns, adjectives, verbs and adverbs
    min_length=2)    # step 6: remove stems shorter than 2 letters
# this is the TF-IDF transformer that will project
# each pre-processed text into a fixed-size vector
tfidf = TFIDF(preprocessing=preprocessing,  # use just-defined pre-processing
              binary=True)                  # use binary term-frequency
aligner = TFIDFAlignment(tfidf, adapt=True)
merged = aligner(transcripts[firstEpisode], subtitles[firstEpisode], vattribute='speech', hattribute='subtitle')
merged
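For intuition, here is a toy illustration of the binary TF-IDF similarity the alignment relies on; it uses scikit-learn as a stand-in for the pyannote implementation, and the sentences are made up.
# toy example (scikit-learn stand-in, not the actual pyannote implementation):
# sentences sharing content words get a high cosine similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
sentences = ['So, what do you think of our new neighbor?',
             'What do you guys think about the new neighbor?',
             'I will pick up my dry cleaning tomorrow.']
vectorizer = TfidfVectorizer(binary=True)  # binary term-frequency, as above
vectors = vectorizer.fit_transform(sentences)
print cosine_similarity(vectors[0], vectors[1:])  # first sentence vs. the two others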
# Build "weak" speaker identification reference based on subtitle timespans aligned with transcripts
weak_reference = {}
for episode in sixEpisodes:
    merged = aligner(transcripts[episode], subtitles[episode], vattribute='speech', hattribute='subtitle')
    annotation = Annotation(uri=episode)
    for start_time, end_time, data in merged.edges_iter(data=True):
        if 'speaker' in data:
            if data['speaker'] not in ['SHELDON', 'RAJ', 'PENNY', 'LEONARD', 'HOWARD']:
                label = 'OTHER'
            else:
                label = data['speaker']
            if start_time.anchored and end_time.anchored:
                annotation[Segment(start_time, end_time)] = label
    # then fill in the gaps with 'non_speech' segments
    extent = Segment(0, dataset.get_episode_duration(episode))
    for gap in annotation.get_timeline().gaps(extent):
        annotation[gap] = 'non_speech'
    weak_reference[episode] = annotation
weak_reference[firstEpisode]
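Before training on it, it is worth comparing the weak reference to the precise one, e.g. in terms of total non-speech duration (a minimal sketch, assuming Annotation.label_duration() from pyannote.core):
# rough sanity check (assumption: label_duration() returns the total
# duration of all segments carrying the given label)
print 'precise non_speech: {0:6.1f}s'.format(reference[firstEpisode].label_duration('non_speech'))
print 'weak    non_speech: {0:6.1f}s'.format(weak_reference[firstEpisode].label_duration('non_speech'))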
from pyannote.features.audio.yaafe import YaafeMFCC
yaafeMFCC = YaafeMFCC(e=False, De=True, DDe=True,  # no static energy, but its first and second derivatives
                      coefs=13, D=True, DD=True)   # 13 MFCC coefficients + their first and second derivatives
mfcc = {}
for episode in sixEpisodes:
    mfcc[episode] = yaafeMFCC.extract(dataset.path_to_audio(episode))
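A quick look at the extracted features (a sketch; it assumes extract() returns a pyannote SlidingWindowFeature whose data attribute is a (n_frames, n_dimensions) numpy array): with 13 static coefficients, their first and second derivatives, plus the first and second derivatives of the energy, each frame should be 41-dimensional.
# sanity check (assumption: .data is a (n_frames, n_dimensions) numpy array)
features = mfcc[firstEpisode]
print features.data.shape  # expected: 13 + 13 + 13 + 1 + 1 = 41 dimensions per frame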
NCOMPONENTS = 64
from pyannote.algorithms.classification.hmm import ViterbiHMM
# we will store one result per episode in this dictionary
hypothesis_full = {}
for trainOnFiveEpisodes, episode in leaveOneOutCrossValidation():
    # this is the episode we are testing;
    # the five other episodes are used for training
    print episode
    # HMM structure:
    # - one state per target, modeled by a 64-component GMM
    # - diagonal covariance matrices
    # - 250ms minimum duration in each speaker state
    hmm = ViterbiHMM(targets=targets,
                     min_duration={t: 0.250 for t in ['SHELDON', 'LEONARD', 'RAJ', 'HOWARD', 'PENNY', 'OTHER']},
                     sampling=500,
                     n_components=NCOMPONENTS,
                     covariance_type='diag')
    # fully supervised (reference) training on five episodes
    hmm.fit([reference[e] for e in trainOnFiveEpisodes],
            [mfcc[e] for e in trainOnFiveEpisodes])
    # testing (Viterbi decoding)
    hypothesis_full[episode] = hmm.apply(mfcc[episode])
hypothesis_full[firstEpisode]
As a bonus (not in the paper), we add a two-pass weakly-supervised speaker identification experiment: each pass is trained on the labels produced by the previous one, starting from the subtitle-based weak reference.
# we will store one result per episode and per pass in this list of dictionaries
nPass = 2
hypothesis_weak = [dict() for p in range(nPass)]
for trainOnFiveEpisodes, episode in leaveOneOutCrossValidation():
    print episode,
    # always contains the output of the previous pass
    # (initialized with the subtitle-based weak reference)
    hypothesis_pass = {e: weak_reference[e] for e in trainOnFiveEpisodes}
    for p in range(nPass):
        print p+1,
        hmm = ViterbiHMM(targets=targets,
                         min_duration={t: 0.250 for t in ['SHELDON', 'LEONARD', 'RAJ', 'HOWARD', 'PENNY', 'OTHER']},
                         sampling=500,
                         n_components=NCOMPONENTS,
                         covariance_type='diag')
        hmm.fit([hypothesis_pass[e] for e in trainOnFiveEpisodes],
                [mfcc[e] for e in trainOnFiveEpisodes])
        hypothesis_weak[p][episode] = hmm.apply(mfcc[episode])
        # the output on the five training episodes becomes
        # the training labels of the next pass
        hypothesis_pass = {e: hmm.apply(mfcc[e]) for e in trainOnFiveEpisodes}
    print
hypothesis_full[firstEpisode]
hypothesis_weak[0][firstEpisode]
from pyannote.metrics.identification import IdentificationErrorRate
# identification error rate (IER) accumulators
ier_full = IdentificationErrorRate()                          # fully supervised HMM
ier_weak = [IdentificationErrorRate() for p in range(nPass)]  # weakly supervised HMM, one per pass
ier_subs = IdentificationErrorRate()                          # subtitle-based weak reference itself
line = 'EPISODE | SUBT. | FULLY'
for p in range(nPass):
    line += ' | WEAK{p:d}'.format(p=p+1)
print line
line = '----------------------------------------------------'
for p in range(nPass):
    line += '--------'
print line
for episode in sixEpisodes:
    subs = ier_subs(reference[episode], weak_reference[episode])
    full = ier_full(reference[episode], hypothesis_full[episode])
    weak = [ier_weak[p](reference[episode], hypothesis_weak[p][episode]) for p in range(nPass)]
    line = '{episode:s} | {subs:4.1f}% | {full:4.1f}%'.format(episode=episode,
                                                              full=100*full,
                                                              subs=100*subs)
    for p in range(nPass):
        line += ' | {weak:4.1f}%'.format(weak=100*weak[p])
    print line
line = '----------------------------------------------------'
for p in range(nPass):
    line += '--------'
print line
line = 'TOTAL | {subs:4.1f}% | {full:4.1f}%'.format(subs=100*abs(ier_subs),
                                                    full=100*abs(ier_full))
for p in range(nPass):
    line += ' | {weak:4.1f}%'.format(weak=100*abs(ier_weak[p]))
print line