# git clone
import os

import matplotlib.pyplot as plt
import moviepy.editor
import numpy as np
import PIL.Image
from scipy.interpolate import interp1d
from scipy.io import wavfile

import dnnlib
import dnnlib.tflib as tflib
import pretrained_networks
# Per-track loudness envelopes keyed by track name; each value is a 1-D array
# with one entry per video frame, scaled so that its maximum is 1.
audio = {}
fps = 60  # video frame rate; the envelopes are resampled to this rate

# NOTE: `seed`, `duration` and `frames` are reassigned for every track, so the
# code further down sees the values of whichever track os.listdir() happened to
# return last. That is only well defined if all tracks have the same length.
for mp3_filename in [f for f in os.listdir('data') if f.endswith('.mp3')]:
    mp3_filename = f'data/{mp3_filename}'
    wav_filename = mp3_filename[:-4] + '.wav'
    if not os.path.exists(wav_filename):
        # Decode the MP3 once to 16-bit PCM WAV so that scipy can read it.
        audio_clip = moviepy.editor.AudioFileClip(mp3_filename)
        audio_clip.write_audiofile(wav_filename, fps=44100, nbytes=2, codec='pcm_s16le')
    # Drops a 15-character prefix and a 5-character suffix from the file name,
    # e.g. 'Culture Shock (Drums).wav' -> 'Drums'.
    track_name = os.path.basename(wav_filename)[15:-5]
    rate, signal = wavfile.read(wav_filename)
    # Work in float64 so that abs() cannot overflow on integer PCM and the
    # in-place division below is always valid.
    signal = np.asarray(signal, dtype=np.float64)
    if signal.ndim > 1:
        signal = np.mean(signal, axis=1)  # to mono
    signal = np.abs(signal)
    seed = signal.shape[0]
    duration = signal.shape[0] / rate
    frames = int(np.ceil(duration * fps))
    samples_per_frame = signal.shape[0] / frames
    audio[track_name] = np.zeros(frames, dtype=signal.dtype)
    for frame in range(frames):
        # Average the rectified samples that fall into this video frame.
        start = int(round(frame * samples_per_frame))
        stop = int(round((frame + 1) * samples_per_frame))
        audio[track_name][frame] = np.mean(signal[start:stop], axis=0)
    # Normalise to a peak of 1; a completely silent track is left at zero
    # instead of being turned into NaNs by a division by zero.
    peak = audio[track_name].max()
    if peak > 0:
        audio[track_name] /= peak
# Open one 8x3-inch matplotlib figure per track, in sorted track-name order.
# NOTE(review): nothing is drawn on (or saved from) these figures in the code
# visible here - the plotting calls may be missing; confirm against the
# original script.
for track in sorted(audio.keys()):
    plt.figure(figsize=(8, 3))
# Load the pretrained generator; only the averaged generator `Gs` is used below.
network_pkl = 'gdrive:networks/stylegan2-ffhq-config-f.pkl'
_G, _D, Gs = pretrained_networks.load_networks(network_pkl)

# Keyword arguments for running the full generator: uint8 NHWC output and
# noise randomisation switched off.
Gs_kwargs = dnnlib.EasyDict(
    output_transform=dict(func=tflib.convert_images_to_uint8, nchw_to_nhwc=True),
    randomize_noise=False,
)

# Same settings for running the synthesis sub-network on its own, plus an
# explicit minibatch size.
Gs_syn_kwargs = dnnlib.EasyDict(
    output_transform=dict(func=tflib.convert_images_to_uint8, nchw_to_nhwc=True),
    randomize_noise=False,
    minibatch_size=4,
)

# Variables of the synthesis network whose name starts with 'noise'.
noise_vars = [
    tensor
    for key, tensor in Gs.components.synthesis.vars.items()
    if key.startswith('noise')
]

# The generator's 'dlatent_avg' variable; used as the truncation centre and as
# the reference for normalize_vector().
w_avg = Gs.get_var('dlatent_avg')
def get_ws(n, frames, seed):
    """Return `frames` smoothly varying 512-d latent vectors, cached on disk.

    `n` random key vectors are drawn from a standard normal distribution with
    `np.random.RandomState(seed)` and spread evenly over the `frames` output
    rows; the rows in between are filled by quadratic interpolation, one latent
    dimension at a time. The result is stored as
    ``data/ws_{n}_{frames}_{seed}.npy`` and simply reloaded on later calls with
    the same arguments.

    Args:
        n: Number of random key vectors.
        frames: Number of interpolated vectors to produce.
        seed: Seed for the random key vectors.

    Returns:
        Array of shape ``(frames, 512)``.
    """
    filename = f'data/ws_{n}_{frames}_{seed}.npy'
    if os.path.exists(filename):
        return np.load(filename)

    src_ws = np.random.RandomState(seed).randn(n, 512)
    # The key vectors are laid out three times in a row and only the middle
    # copy is kept, so the spline has real neighbours on both sides of the
    # first and last key vector (the sequence wraps around instead of being
    # extrapolated at its ends).
    x = np.linspace(0, 3 * frames, 3 * len(src_ws), endpoint=False)
    # Only the middle third of the output grid is ever used, so only that part
    # is evaluated.
    x_mid = np.linspace(0, 3 * frames, 3 * frames, endpoint=False)[frames:2 * frames]
    ws = np.empty((frames, 512))
    # One column at a time keeps the temporary arrays small even for very long
    # sequences.
    for i in range(512):
        y = np.tile(src_ws[:, i], 3)
        ws[:, i] = interp1d(x, y, kind='quadratic', fill_value='extrapolate')(x_mid)

    os.makedirs(os.path.dirname(filename), exist_ok=True)
    np.save(filename, ws)
    return ws
def mix_styles(wa, wb, ivs):
    """Blend selected rows of `wb` into a copy of `wa`.

    Args:
        wa: Base array; it is copied, not modified.
        wb: Array to blend in, indexable the same way as `wa`.
        ivs: Iterable of ``(index, value)`` pairs. `index` selects rows (an
            int, slice, range or index array) and `value` is the blend weight:
            0 keeps `wa`, 1 takes `wb`.

    Returns:
        A new array equal to `wa` except at the selected rows, which hold
        ``wa * (1 - value) + wb * value``.
    """
    w = np.copy(wa)
    for i, v in ivs:
        w[i] = wa[i] * (1 - v) + wb[i] * v
    return w
def normalize_vector(v, ref=None):
    """Rescale `v` to the standard deviation of a reference vector.

    `v` is multiplied by ``std(ref) / std(v)`` and then shifted by
    ``mean(ref) - mean(v)``.

    NOTE(review): the shift uses the mean of the *unscaled* `v`, so the mean of
    the result equals ``mean(ref)`` only when the two standard deviations
    match or ``mean(v)`` is 0. Kept as is because the rendered output depends
    on it.

    Args:
        v: Array to normalise; its standard deviation must be non-zero.
        ref: Reference array. Defaults to the module-level `w_avg`.

    Returns:
        Array with the same shape as `v`.
    """
    if ref is None:
        ref = w_avg
    return v * np.std(ref) / np.std(v) + np.mean(ref) - np.mean(v)
def render_frame(t):
    """Render the video frame for time `t` (seconds) as a `size` x `size` image.

    Used as the frame callback of `moviepy.editor.VideoClip`. Reads the
    module-level audio envelopes and latent sequences, and advances the
    module-level `base_index` on every call, so the result depends on the call
    history and not on `t` alone.

    Args:
        t: Time in seconds.

    Returns:
        The generated image, resized to ``(size, size)``, as a NumPy array.
    """
    global base_index
    # NOTE(review): this line was truncated in the source; reconstructed as
    # "nearest frame index, clamped to the valid range" - confirm.
    frame = np.clip(int(np.round(t * fps)), 0, frames - 1)

    # Travel through the base latent sequence at a speed proportional to the
    # squared 'Instrumental' envelope, wrapping around at the end.
    base_index += base_speed * audio['Instrumental'][frame]**2
    base_w = base_ws[int(round(base_index)) % len(base_ws)]
    base_w = np.tile(base_w, (18, 1))  # same vector for all 18 rows
    # Move away from w_avg by a factor between 0.5 and 1, driven by 'FX'.
    psi = 0.5 + audio['FX'][frame] / 2
    base_w = w_avg + (base_w - w_avg) * psi

    mix_w = np.tile(mix_ws[frame], (18, 1))
    mix_w = w_avg + (mix_w - w_avg) * 0.75

    # Blend mix_w into three groups of rows, each weighted by its own track.
    ranges = [range(0, 4), range(4, 8), range(8, 18)]
    values = [audio[track][frame] for track in ['Drums', 'E Drums', 'Synth']]
    w = mix_styles(base_w, mix_w, zip(ranges, values))
    # Push along the `mouth_open` direction in proportion to the 'Vocal' envelope.
    w += mouth_open * audio['Vocal'][frame] * 1.5

    # NOTE(review): the call on this line was truncated in the source;
    # reconstructed as a batch-of-one run of the synthesis network - confirm.
    image = Gs.components.synthesis.run(np.stack([w]), **Gs_syn_kwargs)[0]
    image = PIL.Image.fromarray(image).resize((size, size), PIL.Image.LANCZOS)
    return np.array(image)
# --- Render settings (read by render_frame) ---------------------------------
size = 1080      # output frames are size x size pixels
resolution = 10  # base latent sequence has this many entries per video frame

# --- Latent sequences --------------------------------------------------------
seconds = int(np.ceil(duration))
base_frames = resolution * frames
base_index = 0  # running position in base_ws, advanced by render_frame
base_ws = get_ws(seconds, base_frames, seed)
mix_ws = get_ws(seconds, frames, seed + 1)

# Chosen so that the per-frame steps base_speed * envelope**2 add up to
# base_frames over the whole 'Instrumental' envelope.
base_speed = base_frames / sum(audio['Instrumental']**2)

# Negated vector loaded from disk, rescaled by normalize_vector().
mouth_open = normalize_vector(-np.load('data/mouth_ratio.npy'))

# --- Assemble audio and video, then encode -----------------------------------
mp4_filename = 'data/Culture Shock.mp4'
audio_clip_i = moviepy.editor.AudioFileClip('data/Culture Shock (Instrumental).wav')
audio_clip_v = moviepy.editor.AudioFileClip('data/Culture Shock (Vocal).wav')
audio_clip = moviepy.editor.CompositeAudioClip([audio_clip_i, audio_clip_v])
video_clip = moviepy.editor.VideoClip(render_frame, duration=duration).set_audio(audio_clip)
video_clip.write_videofile(
    mp4_filename,
    fps=fps,
    codec='libx264',
    audio_codec='aac',
    bitrate='8M',
)
rolux commented Jan 14, 2020

The most obvious parameters to play with are the psi values in lines 91 and 93, and the mouth_open multiplier.

Also, there are more vectors to try out.

Hi how did you set audio['Instrumental'] or ['FX'] from the wavfile? I am getting KeyError, and I am not seeing how you set those keys from the code. Maybe I am missing something.

rolux commented Jan 20, 2020

@swapp1990: You're missing the audio files! Culture Shock (Drums).mp3, Culture Shock (E Drums).mp3, etc.

aisdn commented Jan 21, 2020

Me too:

Setting up TensorFlow plugin "": Preprocessing... Loading... Done.
Setting up TensorFlow plugin "": Preprocessing... Loading... Done.
Traceback (most recent call last):
File "", line 107, in
base_speed = base_frames / sum(audio['Instrumental']**2)
KeyError: 'Instrumental'

rolux commented Jan 21, 2020

@zncook: See my comment above!

aisdn commented Jan 21, 2020

@zncook: See my comment above!

Hi. Where can I download these audio files (Culture Shock (Drums).mp3, Culture Shock (E Drums).mp3, etc.)?

can I customize these audio files?

rolux commented Jan 21, 2020

@zncook: as it says above:

Of course, you can provide your own files, split them with a source-separation tool, etc.

Which Python version do you use? StyleGAN2 requires 3.6 or later, but PIL is for 2.7 or earlier.

Which Python version do you use? StyleGAN2 requires 3.6 or later, but PIL is for 2.7 or earlier.

@nikitatishin5 install pillow instead of PIL and use python 3.6

if I want to combine another parameter to the drums for example, how should I do it? I already downloaded other vectors but I can't seem to make it work properly

