#!/usr/bin/env python
# coding: utf-8

# Start by creating a new `conda` environment:
#
# ```bash
# $ conda create -n pyannote python=3.6 anaconda
# $ source activate pyannote
# ```
#
# Then, install `pyannote-video` and its dependencies:
#
# ```bash
# $ pip install pyannote-video
# ```
#
# Finally, download the sample video and `dlib` models:
#
# ```bash
# $ git clone https://github.com/pyannote/pyannote-data.git
# $ git clone https://github.com/davisking/dlib-models.git
# $ bunzip2 dlib-models/dlib_face_recognition_resnet_model_v1.dat.bz2
# $ bunzip2 dlib-models/shape_predictor_68_face_landmarks.dat.bz2
# ```
#
# To execute this notebook locally:
#
# ```bash
# $ git clone https://github.com/pyannote/pyannote-video.git
# $ jupyter notebook --notebook-dir="pyannote-video/doc"
# ```

# In[4]:


get_ipython().run_line_magic('pylab', 'inline')


# # Shot segmentation

# In[5]:


get_ipython().system('pyannote-structure.py --help')


# In[7]:


get_ipython().system('pyannote-structure.py shot --verbose ../../pyannote-data/TheBigBangTheory.mkv ../../pyannote-data/TheBigBangTheory.shots.json')


# Detected shot boundaries can be visualized using `pyannote.core` notebook support:

# In[8]:


from pyannote.core.json import load_from
shots = load_from('../../pyannote-data/TheBigBangTheory.shots.json')
shots


# # Face processing

# In[9]:


get_ipython().system('pyannote-face.py --help')


# ### Face tracking

# In[10]:


get_ipython().system('pyannote-face.py track --verbose --every=0.5 ../../pyannote-data/TheBigBangTheory.mkv ../../pyannote-data/TheBigBangTheory.shots.json ../../pyannote-data/TheBigBangTheory.track.txt')


# Face tracks can be visualized using `demo` mode:

# In[12]:


get_ipython().system('pyannote-face.py demo ../../pyannote-data/TheBigBangTheory.mkv ../../pyannote-data/TheBigBangTheory.track.txt ../../pyannote-data/TheBigBangTheory.track.mp4')


# In[14]:


import io
import base64
from IPython.display import HTML

video = io.open('../../pyannote-data/TheBigBangTheory.track.mp4', 'rb').read()
encoded = base64.b64encode(video)
# display the MP4 inline as a base64-encoded <video> element
HTML(data='''<video alt="face tracking demo" controls>
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii')))


# ### Facial landmarks and face embedding

# In[15]:


get_ipython().system('pyannote-face.py extract --verbose ../../pyannote-data/TheBigBangTheory.mkv ../../pyannote-data/TheBigBangTheory.track.txt ../../dlib-models/shape_predictor_68_face_landmarks.dat ../../dlib-models/dlib_face_recognition_resnet_model_v1.dat ../../pyannote-data/TheBigBangTheory.landmarks.txt ../../pyannote-data/TheBigBangTheory.embedding.txt')


# ### Face clustering

# Once embeddings are extracted, we can apply hierarchical agglomerative clustering to the face tracks.
# The distance between two clusters is defined as the average Euclidean distance between all pairs of embeddings, one from each cluster.
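# As a rough illustration of this average-linkage criterion (a minimal sketch assuming
# `numpy` and `scipy` are available; the helper name `average_linkage` is ours and this
# is not pyannote's internal implementation), the distance between two clusters of
# embeddings can be computed as the mean of all pairwise Euclidean distances:

# In[ ]:


import numpy as np
from scipy.spatial.distance import cdist

def average_linkage(cluster_a, cluster_b):
    """Mean Euclidean distance over all pairs (one embedding from each cluster)."""
    # cluster_a: (n, d) array, cluster_b: (m, d) array of face embeddings
    return cdist(cluster_a, cluster_b, metric='euclidean').mean()

# toy example with random vectors standing in for 128-dimensional dlib embeddings
a = np.random.randn(5, 128)
b = np.random.randn(3, 128)
average_linkage(a, b)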
# In[16]:


from pyannote.video.face.clustering import FaceClustering
clustering = FaceClustering(threshold=0.6)


# In[17]:


face_tracks, embeddings = clustering.model.preprocess('../../pyannote-data/TheBigBangTheory.embedding.txt')
face_tracks.get_timeline()


# In[18]:


result = clustering(face_tracks, features=embeddings)


# In[19]:


from pyannote.core import notebook, Segment
notebook.reset()
notebook.crop = Segment(0, 30)
mapping = {9: 'Leonard', 6: 'Sheldon', 14: 'Receptionist', 5: 'False_alarm'}
result = result.rename_labels(mapping=mapping)
result


# In[21]:


# export one "track_id cluster" pair per line
with open('../../pyannote-data/TheBigBangTheory.labels.txt', 'w') as fp:
    for _, track_id, cluster in result.itertracks(yield_label=True):
        fp.write(f'{track_id} {cluster}\n')


# In[23]:


get_ipython().system('pyannote-face.py demo ../../pyannote-data/TheBigBangTheory.mkv ../../pyannote-data/TheBigBangTheory.track.txt --label=../../pyannote-data/TheBigBangTheory.labels.txt ../../pyannote-data/TheBigBangTheory.final.mp4')


# In[25]:


import io
import base64
from IPython.display import HTML

video = io.open('../../pyannote-data/TheBigBangTheory.final.mp4', 'rb').read()
encoded = base64.b64encode(video)
# display the labeled MP4 inline as a base64-encoded <video> element
HTML(data='''<video alt="labeled face tracks" controls>
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii')))
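# As a quick sanity check on the exported labels file (a minimal sketch using only the
# standard library; the per-line "track_id cluster" format follows the export cell above),
# we can count how many face tracks were assigned to each character:

# In[ ]:


from collections import Counter

# tally cluster labels, one per face track
with open('../../pyannote-data/TheBigBangTheory.labels.txt') as fp:
    counts = Counter(line.split()[1] for line in fp if line.strip())
counts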