diff --git a/DeepEncode.py b/DeepEncode.py
index b42e998..b6befe5 100644
--- a/DeepEncode.py
+++ b/DeepEncode.py
@@ -1,91 +1,68 @@
 import tensorflow as tf
 import numpy as np
 import cv2
-from video_compression_model import NUM_FRAMES, PRESET_SPEED_CATEGORIES, VideoCompressionModel
+from video_compression_model import VideoCompressionModel
 
 # Constants
-MAX_FRAMES = 24
 CHUNK_SIZE = 24  # Adjust based on available memory and video resolution
-COMPRESSED_VIDEO_FILE = 'compressed_video.mkv'
+COMPRESSED_VIDEO_FILE = 'compressed_video.mp4'
+MAX_FRAMES = 24  # Limit the number of frames processed
 
-# Step 2: Load the trained model
-model = tf.keras.models.load_model('models/model_differencing.keras', custom_objects={'VideoCompressionModel': VideoCompressionModel})
-
-# Step 3: Load the uncompressed video
+# Load the trained model
+model = tf.keras.models.load_model('models/model.keras', custom_objects={'VideoCompressionModel': VideoCompressionModel})
+
+# Load the uncompressed video
 UNCOMPRESSED_VIDEO_FILE = 'test_data/training_video.mkv'
 
-def load_frames_from_video(video_file, start_frame=0, num_frames=CHUNK_SIZE):
+def load_frame_from_video(video_file, frame_num):
     cap = cv2.VideoCapture(video_file)
-    frames = []
-    cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
-
-    for _ in range(num_frames):
-        ret, frame = cap.read()
-        if not ret:
-            break
-        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0  # Normalize and convert to float32
-        frames.append(frame)
+    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
+    ret, frame = cap.read()
+    if not ret:
+        cap.release()  # Release the capture on the early-exit path too
+        return None
+    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0  # Normalize and convert to float32
     cap.release()
-    return frames
-
-def predict_in_chunks(uncompressed_frames, model, crf_values, preset_speed_values):
-    num_sequences = len(uncompressed_frames) - NUM_FRAMES + 1
-    compressed_frames = []
-    #for frame in uncompressed_frames:
-    #    cv2.imshow("frame", frame)
-    #    cv2.waitKey(50)
+    #display_frame = np.clip(frame * 255.0, 0, 255).astype(np.uint8)
+    #cv2.imshow("uncomp", display_frame)
+    #cv2.waitKey(0)  # Hold the display window until a key is pressed
 
-    for start in range(0, num_sequences, CHUNK_SIZE):
-        end = min(start + CHUNK_SIZE, num_sequences)
-        frame_chunk = uncompressed_frames[start:end + NUM_FRAMES - 1]
-        crf_chunk = crf_values[start:end]
-        speed_chunk = preset_speed_values[start:end]
-
-        frame_sequences = []
-        for i in range(len(frame_chunk) - NUM_FRAMES + 1):
-            sequence = frame_chunk[i:i + NUM_FRAMES]
-            frame_sequences.append(sequence)
-
-        frame_sequences = np.array(frame_sequences)
-
-        compressed_chunk = model.predict({"frames": frame_sequences, "crf": crf_chunk, "preset_speed": speed_chunk})
-        compressed_frames.extend(compressed_chunk)
-
-    return compressed_frames
-
-def save_frames_chunk(frames, video_writer):
-    for frame in frames:
-        frame = np.clip(frame * 255.0, 0, 255).astype(np.uint8)
-        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
-        video_writer.write(frame)
+
+    return frame
 
+def predict_frame(uncompressed_frame, model, crf_value, preset_speed_value):
+    crf_array = np.array([crf_value])
+    preset_speed_array = np.array([preset_speed_value])
+
+    compressed_frame = model.predict({
+        "frame": np.array([uncompressed_frame]),
+        "crf": crf_array,
+        "preset_speed": preset_speed_array
+    })
+    return compressed_frame[0]
 
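+# NOTE: load_frame_from_video() reopens and seeks the capture for every frame,
+# which is simple but slow; keeping a single cv2.VideoCapture open across the
+# loop below would avoid the repeated open/seek cost.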
 cap = cv2.VideoCapture(UNCOMPRESSED_VIDEO_FILE)
 total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+height, width = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)), int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+cap.release()  # Release only after reading the frame count and dimensions
 
 if MAX_FRAMES != 0 and total_frames > MAX_FRAMES:
     total_frames = MAX_FRAMES
-cap.release()
 
+crf_value = 25.0  # Example CRF value
+preset_speed_value = 4  # Index of "fast" in PRESET_SPEED_CATEGORIES
-crf_values = np.full((CHUNK_SIZE + NUM_FRAMES - 1, 1), 25, dtype=np.float32)  # Chunk size + look-ahead frames
-preset_speed_index = PRESET_SPEED_CATEGORIES.index("fast")
-preset_speed_values = np.full((CHUNK_SIZE + NUM_FRAMES - 1, 1), preset_speed_index, dtype=np.float32)
 
+fourcc = cv2.VideoWriter_fourcc(*'H264')  # 'mp4v' is a safer fallback if this codec is unavailable in your OpenCV build
+out = cv2.VideoWriter(COMPRESSED_VIDEO_FILE, fourcc, 24.0, (width, height))
-out = None  # Video writer instance
-for i in range(0, total_frames, CHUNK_SIZE):
-    uncompressed_frames_chunk = load_frames_from_video(UNCOMPRESSED_VIDEO_FILE, start_frame=i)
-    compressed_frames_chunk = predict_in_chunks(uncompressed_frames_chunk, model, crf_values, preset_speed_values)
+for i in range(total_frames):
+    uncompressed_frame = load_frame_from_video(UNCOMPRESSED_VIDEO_FILE, frame_num=i)
+    if uncompressed_frame is None:
+        break  # Stop early if the capture runs out of frames
+    compressed_frame = predict_frame(uncompressed_frame, model, crf_value, preset_speed_value)
 
-    # Initialize video writer if it's the first chunk
-    if out is None:
-        height, width = compressed_frames_chunk[0].shape[:2]
-        fourcc = cv2.VideoWriter_fourcc(*'XVID')
-        out = cv2.VideoWriter(COMPRESSED_VIDEO_FILE, fourcc, 24.0, (width, height))
-
-    save_frames_chunk(compressed_frames_chunk, out)
+    compressed_frame = np.clip(compressed_frame * 255.0, 0, 255).astype(np.uint8)
+    compressed_frame = cv2.cvtColor(compressed_frame, cv2.COLOR_RGB2BGR)
+    out.write(compressed_frame)
+    cv2.imshow("output", compressed_frame)
+    cv2.waitKey(1)  # Required for the preview window to actually refresh
 
 out.release()
 print("Compression completed.")
diff --git a/train_model.py b/train_model.py
index bbef678..970eb33 100644
--- a/train_model.py
+++ b/train_model.py
@@ -1,16 +1,16 @@
 import os
 import json
-import tensorflow as tf
 import numpy as np
 import cv2
-from video_compression_model import NUM_CHANNELS, NUM_FRAMES, VideoCompressionModel, PRESET_SPEED_CATEGORIES
+import tensorflow as tf
+from video_compression_model import NUM_CHANNELS, VideoCompressionModel, PRESET_SPEED_CATEGORIES
 from tensorflow.keras.callbacks import EarlyStopping
 
 print(tf.config.list_physical_devices('GPU'))
 
 # Constants
 BATCH_SIZE = 8
-EPOCHS = 5
+EPOCHS = 50
 TRAIN_SAMPLES = 5
 
 def load_list(list_path):
@@ -18,26 +18,18 @@ def load_list(list_path):
         video_details_list = json.load(json_file)
     return video_details_list
 
-def load_frames_from_video(video_file, num_frames):
-    print("Extracting video frames...")
+def load_frame_from_video(video_file):
+    print("Extracting video frame...")
     cap = cv2.VideoCapture(video_file)
-    frames = []
-    count = 0
-    while True:
-        ret, frame = cap.read()
-        if not ret:
-            break
-        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-        frames.append(frame)
-        count += 1
-        if count >= num_frames:
-            break
+    ret, frame = cap.read()
+    if not ret:
+        cap.release()  # Release the capture on the early-exit path too
+        return None
+    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
     cap.release()
-    width, height = frame.shape[:2]
-    return frames, width, height
+    return frame
 
-def preprocess(frames):
-    return np.array(frames) / 255.0
+def preprocess(frame):
+    return frame / 255.0
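+# NOTE: this scaling must stay in sync with DeepEncode.py, which converts
+# frames to float32 and divides by 255.0 before calling the model.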
 
 def save_model(model, file):
     os.makedirs("models", exist_ok=True)
@@ -54,109 +46,62 @@ def load_video_from_list(list_path):
         PRESET_SPEED = PRESET_SPEED_CATEGORIES.index(video_details['preset_speed'])
         video_details['preset_speed'] = PRESET_SPEED
 
-        train_frames, w, h = load_frames_from_video(os.path.join("test_data/", VIDEO_FILE), NUM_FRAMES * TRAIN_SAMPLES)
+        frame = load_frame_from_video(os.path.join("test_data/", VIDEO_FILE))
 
-        all_frames.extend(train_frames)
-        all_details.append({
-            "frames": train_frames,
-            "width": w,
-            "height": h,
-            "crf": CRF,
-            "preset_speed": PRESET_SPEED,
-            "video_file": VIDEO_FILE
-        })
+        if frame is not None:
+            all_frames.append(preprocess(frame))
+            all_details.append({
+                "frame": preprocess(frame),  # Store the normalized frame so training matches inference scaling
+                "crf": CRF,
+                "preset_speed": PRESET_SPEED,
+                "video_file": VIDEO_FILE
+            })
 
     return all_details
 
-def generate_frame_sequences(frames):
-    sequences = []
-    labels = []
-    for i in range(len(frames) - NUM_FRAMES + 1):
-        sequence = frames[i:i+NUM_FRAMES-1]
-        sequences.append(sequence)
-        labels.append(sequence[-1])
-    return np.array(sequences), np.array(labels)
-
-def frame_difference(frames):
-    differences = []
-    for i in range(1, len(frames)):
-        differences.append(cv2.absdiff(frames[i], frames[i-1]))
-    return differences
-
 def main():
     all_video_details_train = load_video_from_list("test_data/training.json")
     all_video_details_val = load_video_from_list("test_data/validation.json")
 
-    model = VideoCompressionModel(NUM_CHANNELS, NUM_FRAMES)
+    model = VideoCompressionModel(NUM_CHANNELS)
     model.compile(loss='mean_squared_error', optimizer='adam')
     early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True)
 
-    # Load and concatenate all sequences and labels
-    all_train_sequences = []
-    all_val_sequences = []
-    all_train_labels = []
-    all_val_labels = []
+    # Prepare data
+    all_train_frames = []
+    all_val_frames = []
     all_crf_train = []
     all_crf_val = []
     all_preset_speed_train = []
     all_preset_speed_val = []
 
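+    # training.json and validation.json are assumed to list the same videos in
+    # the same order, so zip() below pairs each source frame with its encoded
+    # counterpart and that video's CRF/preset settings.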
     for video_details_train, video_details_val in zip(all_video_details_train, all_video_details_val):
-        train_frames = video_details_train["frames"]
-        val_frames = video_details_val["frames"]
-
-        train_differences = frame_difference(preprocess(train_frames))
-        val_differences = frame_difference(preprocess(val_frames))
-
-        #print(len(train_differences), train_differences[0].shape)
-
-        train_sequences, train_labels = generate_frame_sequences(train_differences)
-        val_sequences, val_labels = generate_frame_sequences(val_differences)
-
-        crf_array_train = np.full((len(train_sequences), 1), video_details_train['crf'])
-        crf_array_val = np.full((len(val_sequences), 1), video_details_val['crf'])
-        preset_speed_array_train = np.full((len(train_sequences), 1), video_details_train['preset_speed'])
-        preset_speed_array_val = np.full((len(val_sequences), 1), video_details_val['preset_speed'])
-
-        all_train_sequences.extend(train_sequences)
-        all_val_sequences.extend(val_sequences)
-        all_train_labels.extend(train_labels)
-        all_val_labels.extend(val_labels)
-        all_crf_train.extend(crf_array_train)
-        all_crf_val.extend(crf_array_val)
-        all_preset_speed_train.extend(preset_speed_array_train)
-        all_preset_speed_val.extend(preset_speed_array_val)
+        all_train_frames.append(video_details_train["frame"])
+        all_val_frames.append(video_details_val["frame"])
+        all_crf_train.append(video_details_train['crf'])
+        all_crf_val.append(video_details_val['crf'])
+        all_preset_speed_train.append(video_details_train['preset_speed'])
+        all_preset_speed_val.append(video_details_val['preset_speed'])
 
     # Convert lists to numpy arrays
-    all_train_sequences = np.array(all_train_sequences)
-    all_val_sequences = np.array(all_val_sequences)
-    all_train_labels = np.array(all_train_labels)
-    all_val_labels = np.array(all_val_labels)
+    all_train_frames = np.array(all_train_frames)
+    all_val_frames = np.array(all_val_frames)
     all_crf_train = np.array(all_crf_train)
     all_crf_val = np.array(all_crf_val)
     all_preset_speed_train = np.array(all_preset_speed_train)
     all_preset_speed_val = np.array(all_preset_speed_val)
 
-    # Shuffle the training data
-    indices_train = np.arange(all_train_sequences.shape[0])
-    np.random.shuffle(indices_train)
-
-    all_train_sequences = all_train_sequences[indices_train]
-    all_train_labels = all_train_labels[indices_train]
-    all_crf_train = all_crf_train[indices_train]
-    all_preset_speed_train = all_preset_speed_train[indices_train]
-
-    print("\nTraining the model on mixed sequences...")
+    print("\nTraining the model on frame pairs...")
     model.fit(
-        {"frames": all_train_sequences, "crf": all_crf_train, "preset_speed": all_preset_speed_train},
-        all_train_labels,
+        {"frame": all_train_frames, "crf": all_crf_train, "preset_speed": all_preset_speed_train},
+        all_val_frames,  # Target is the corresponding compressed frame
         batch_size=BATCH_SIZE,
         epochs=EPOCHS,
-        validation_data=({"frames": all_val_sequences, "crf": all_crf_val, "preset_speed": all_preset_speed_val}, all_val_labels),
+        validation_data=({"frame": all_val_frames, "crf": all_crf_val, "preset_speed": all_preset_speed_val}, all_val_frames),
         callbacks=[early_stop]
     )
 
     print("\nTraining completed!")
-    save_model(model, 'model_differencing.keras')
+    save_model(model, 'model.keras')
 
 if __name__ == "__main__":
     main()
diff --git a/video_compression_model.py b/video_compression_model.py
index a74753b..8dc7268 100644
--- a/video_compression_model.py
+++ b/video_compression_model.py
@@ -2,7 +2,6 @@ import tensorflow as tf
 
 PRESET_SPEED_CATEGORIES = ["ultrafast", "superfast", "veryfast", "faster", "fast", "medium", "slow", "slower", "veryslow"]
 NUM_PRESET_SPEEDS = len(PRESET_SPEED_CATEGORIES)
-NUM_FRAMES = 5  # Number of consecutive frames in a sequence
 NUM_CHANNELS = 3  # Number of color channels in the video frames (RGB images have 3 channels)
 
 #policy = tf.keras.mixed_precision.Policy('mixed_float16')
@@ -13,7 +12,6 @@ class VideoCompressionModel(tf.keras.Model):
         super(VideoCompressionModel, self).__init__()
 
         self.NUM_CHANNELS = NUM_CHANNELS
-        self.NUM_FRAMES = NUM_FRAMES
 
         # Regularization
         self.regularizer = tf.keras.regularizers.l2(regularization_factor)
@@ -23,21 +21,24 @@ class VideoCompressionModel(tf.keras.Model):
 
         # Encoder layers
         self.encoder = tf.keras.Sequential([
-            tf.keras.layers.Conv3D(32, (3, 3, 3), activation='relu', padding='same', input_shape=(None, None, None, NUM_CHANNELS + 1 + 16), kernel_regularizer=self.regularizer),
-            tf.keras.layers.MaxPooling3D((2, 2, 2)),
+            tf.keras.layers.ZeroPadding2D(padding=((1, 1), (1, 1))),  # Padding to preserve spatial dimensions
+            tf.keras.layers.Conv2D(32, (3, 3), activation='relu', padding='same', kernel_regularizer=self.regularizer),
+            tf.keras.layers.MaxPooling2D((2, 2)),
             # Add more encoder layers as needed
         ])
 
        # Decoder layers
         self.decoder = tf.keras.Sequential([
-            tf.keras.layers.Conv3DTranspose(32, (3, 3, 3), activation='relu', padding='same', kernel_regularizer=self.regularizer),
-            tf.keras.layers.UpSampling3D((2, 2, 2)),
+            tf.keras.layers.Conv2DTranspose(32, (3, 3), activation='relu', padding='same', kernel_regularizer=self.regularizer),
+            tf.keras.layers.UpSampling2D((2, 2)),
             # Add more decoder layers as needed
-            tf.keras.layers.Conv3D(NUM_CHANNELS, (3, 3, 3), activation='sigmoid', padding='same', kernel_regularizer=self.regularizer)  # Output layer for video frames
+            tf.keras.layers.Conv2D(NUM_CHANNELS, (3, 3), activation='sigmoid', padding='same', kernel_regularizer=self.regularizer),  # Output layer for video frames
+            tf.keras.layers.Cropping2D(cropping=((1, 1), (1, 1)))  # Crop to offset the encoder's ZeroPadding2D so output size matches input
         ])
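+        # The first Conv2D sees NUM_CHANNELS + 1 + 16 input channels: RGB plus
+        # one tiled CRF plane plus a 16-dim preset-speed embedding, broadcast
+        # across the frame in call() below.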
 
     def call(self, inputs):
-        frames = inputs["frames"]
+        frame = inputs["frame"]
         crf = tf.expand_dims(inputs["crf"], -1)
         preset_speed = inputs["preset_speed"]
 
@@ -46,15 +47,15 @@ class VideoCompressionModel(tf.keras.Model):
         preset_embedding = tf.keras.layers.Flatten()(preset_embedding)
 
         # Concatenate crf and preset_embedding to frames
-        frames_shape = tf.shape(frames)
-        repeated_crf = tf.tile(tf.reshape(crf, (-1, 1, 1, 1, 1)), [1, frames_shape[1], frames_shape[2], frames_shape[3], 1])
-        repeated_preset = tf.tile(tf.reshape(preset_embedding, (-1, 1, 1, 1, 16)), [1, frames_shape[1], frames_shape[2], frames_shape[3], 1])
+        frame_shape = tf.shape(frame)
+        repeated_crf = tf.tile(tf.reshape(crf, (-1, 1, 1, 1)), [1, frame_shape[1], frame_shape[2], 1])
+        repeated_preset = tf.tile(tf.reshape(preset_embedding, (-1, 1, 1, 16)), [1, frame_shape[1], frame_shape[2], 1])
 
-        frames = tf.concat([frames, repeated_crf, repeated_preset], axis=-1)
+        frame = tf.concat([tf.cast(frame, tf.float32), repeated_crf, repeated_preset], axis=-1)
 
-        # Encoding the video frames
-        compressed_representation = self.encoder(frames)
+        # Encoding the frame
+        compressed_representation = self.encoder(frame)
 
-        # Decoding to generate compressed video frames
-        reconstructed_frames = self.decoder(compressed_representation)
-        return reconstructed_frames[:,-1,:,:,:]
+        # Decoding to generate compressed frame
+        reconstructed_frame = self.decoder(compressed_representation)
+        return reconstructed_frame
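+
+
+# Illustrative smoke test (assumes the VideoCompressionModel(NUM_CHANNELS)
+# constructor call used in train_model.py): runs one dummy frame through the
+# model to confirm the pad/crop bookkeeping returns the input resolution.
+if __name__ == "__main__":
+    import numpy as np
+    test_model = VideoCompressionModel(NUM_CHANNELS)
+    dummy_inputs = {
+        "frame": np.zeros((1, 64, 64, NUM_CHANNELS), dtype=np.float32),
+        "crf": np.array([25.0], dtype=np.float32),
+        "preset_speed": np.array([PRESET_SPEED_CATEGORIES.index("fast")]),
+    }
+    print(test_model(dummy_inputs).shape)  # Expected: (1, 64, 64, 3)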