test

2023-07-26 01:04:22 +01:00 · 2023-07-26 01:04:22 +01:00 · 5085c87300
commit 5085c87300
parent 8c5001166d
3 changed files with 96 additions and 173 deletions
--- a/DeepEncode.py
+++ b/DeepEncode.py
@@ -1,91 +1,68 @@
 import tensorflow as tf
 import numpy as np
 import cv2
-from video_compression_model import NUM_FRAMES, PRESET_SPEED_CATEGORIES, VideoCompressionModel
+from video_compression_model import VideoCompressionModel

 # Constants
-MAX_FRAMES = 24
 CHUNK_SIZE = 24  # Adjust based on available memory and video resolution
-COMPRESSED_VIDEO_FILE = 'compressed_video.mkv'
+COMPRESSED_VIDEO_FILE = 'compressed_video.mp4'
+MAX_FRAMES = 24  # Limit the number of frames processed (0 disables the limit)

+# Load the trained model
+model = tf.keras.models.load_model('models/model.keras', custom_objects={'VideoCompressionModel': VideoCompressionModel})

-# Step 2: Load the trained model
-model = tf.keras.models.load_model('models/model_differencing.keras', custom_objects={'VideoCompressionModel': VideoCompressionModel})
-
-# Step 3: Load the uncompressed video
+# Load the uncompressed video
 UNCOMPRESSED_VIDEO_FILE = 'test_data/training_video.mkv'

-def load_frames_from_video(video_file, start_frame=0, num_frames=CHUNK_SIZE):
+def load_frame_from_video(video_file, frame_num):
    cap = cv2.VideoCapture(video_file)
-    frames = []
-    cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
-    
-    for _ in range(num_frames):
-        ret, frame = cap.read()
-        if not ret:
-            break
-        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0  # Normalize and convert to float32
-        frames.append(frame)
+    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
+    ret, frame = cap.read()
+    if not ret:
+        return None
+    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0  # Normalize and convert to float32
    cap.release()
-    return frames
-
-def predict_in_chunks(uncompressed_frames, model, crf_values, preset_speed_values):
-    num_sequences = len(uncompressed_frames) - NUM_FRAMES + 1
-    compressed_frames = []
    
-    #for frame in uncompressed_frames:
-    #    cv2.imshow("frame", frame)
-    #    cv2.waitKey(50)
+    #display_frame = np.clip(frame * 255.0, 0, 255).astype(np.uint8)
+    #cv2.imshow("uncomp", display_frame)
+    #cv2.waitKey(0)  # Uncomment to hold the display window until a key is pressed

-    for start in range(0, num_sequences, CHUNK_SIZE):
-        end = min(start + CHUNK_SIZE, num_sequences)
-        frame_chunk = uncompressed_frames[start:end + NUM_FRAMES - 1]
-        crf_chunk = crf_values[start:end]
-        speed_chunk = preset_speed_values[start:end]
-
-        frame_sequences = []
-        for i in range(len(frame_chunk) - NUM_FRAMES + 1):
-            sequence = frame_chunk[i:i + NUM_FRAMES]
-            frame_sequences.append(sequence)
-            
-        frame_sequences = np.array(frame_sequences)
-
-        compressed_chunk = model.predict({"frames": frame_sequences, "crf": crf_chunk, "preset_speed": speed_chunk})
-        compressed_frames.extend(compressed_chunk)
-        
-    return compressed_frames
-
-def save_frames_chunk(frames, video_writer):
-    for frame in frames:
-        frame = np.clip(frame * 255.0, 0, 255).astype(np.uint8)
-        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
-        video_writer.write(frame)
+    
+    return frame

+def predict_frame(uncompressed_frame, model, crf_value, preset_speed_value):
+    crf_array = np.array([crf_value])
+    preset_speed_array = np.array([preset_speed_value])
+    
+    compressed_frame = model.predict({
+        "frame": np.array([uncompressed_frame]), 
+        "crf": crf_array, 
+        "preset_speed": preset_speed_array
+    })
+    return compressed_frame[0]

 cap = cv2.VideoCapture(UNCOMPRESSED_VIDEO_FILE)
 total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+cap.release()

 if MAX_FRAMES != 0 and total_frames > MAX_FRAMES:
    total_frames = MAX_FRAMES

-cap.release()
+crf_value = 25.0  # Example CRF value
+preset_speed_value = 2  # NOTE(review): index 2 is "veryfast" in PRESET_SPEED_CATEGORIES; "fast" is index 4

-crf_values = np.full((CHUNK_SIZE + NUM_FRAMES - 1, 1), 25, dtype=np.float32)  # Chunk size + look-ahead frames
-preset_speed_index = PRESET_SPEED_CATEGORIES.index("fast")
-preset_speed_values = np.full((CHUNK_SIZE + NUM_FRAMES - 1, 1), preset_speed_index, dtype=np.float32)
+height, width = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)), int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+fourcc = cv2.VideoWriter_fourcc(*'H264')
+out = cv2.VideoWriter(COMPRESSED_VIDEO_FILE, fourcc, 24.0, (width, height))

-out = None  # Video writer instance
-for i in range(0, total_frames, CHUNK_SIZE):
-    uncompressed_frames_chunk = load_frames_from_video(UNCOMPRESSED_VIDEO_FILE, start_frame=i)
-    compressed_frames_chunk = predict_in_chunks(uncompressed_frames_chunk, model, crf_values, preset_speed_values)
+for i in range(total_frames):
+    uncompressed_frame = load_frame_from_video(UNCOMPRESSED_VIDEO_FILE, frame_num=i)
+    compressed_frame = predict_frame(uncompressed_frame, model, crf_value, preset_speed_value)
    
-    # Initialize video writer if it's the first chunk
-    if out is None:
-        height, width = compressed_frames_chunk[0].shape[:2]
-        fourcc = cv2.VideoWriter_fourcc(*'XVID')
-        out = cv2.VideoWriter(COMPRESSED_VIDEO_FILE, fourcc, 24.0, (width, height))
-    
-    save_frames_chunk(compressed_frames_chunk, out)
+    compressed_frame = np.clip(compressed_frame * 255.0, 0, 255).astype(np.uint8)
+    compressed_frame = cv2.cvtColor(compressed_frame, cv2.COLOR_RGB2BGR)
+    out.write(compressed_frame)
+    cv2.imshow("output", compressed_frame)

 out.release()
 print("Compression completed.")
--- a/train_model.py
+++ b/train_model.py
@@ -1,16 +1,16 @@
 import os
 import json
-import tensorflow as tf
 import numpy as np
 import cv2
-from video_compression_model import NUM_CHANNELS, NUM_FRAMES, VideoCompressionModel, PRESET_SPEED_CATEGORIES
+import tensorflow as tf
+from video_compression_model import NUM_CHANNELS, VideoCompressionModel, PRESET_SPEED_CATEGORIES
 from tensorflow.keras.callbacks import EarlyStopping

 print(tf.config.list_physical_devices('GPU'))

 # Constants
 BATCH_SIZE = 8
-EPOCHS = 5
+EPOCHS = 50
 TRAIN_SAMPLES = 5

 def load_list(list_path):
@@ -18,26 +18,18 @@ def load_list(list_path):
        video_details_list = json.load(json_file)
    return video_details_list

-def load_frames_from_video(video_file, num_frames):
-    print("Extracting video frames...")
+def load_frame_from_video(video_file):
+    print("Extracting video frame...")
    cap = cv2.VideoCapture(video_file)
-    frames = []
-    count = 0
-    while True:
-        ret, frame = cap.read()
-        if not ret:
-            break
-        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-        frames.append(frame)
-        count += 1
-        if count >= num_frames:
-            break
+    ret, frame = cap.read()
+    if not ret:
+        return None
+    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    cap.release()
-    width, height = frame.shape[:2]
-    return frames, width, height
+    return frame

-def preprocess(frames):
-    return np.array(frames) / 255.0
+def preprocess(frame):
+    return frame / 255.0

 def save_model(model, file):
    os.makedirs("models", exist_ok=True)
@@ -54,109 +46,62 @@ def load_video_from_list(list_path):
        PRESET_SPEED = PRESET_SPEED_CATEGORIES.index(video_details['preset_speed'])
        video_details['preset_speed'] = PRESET_SPEED

-        train_frames, w, h = load_frames_from_video(os.path.join("test_data/", VIDEO_FILE), NUM_FRAMES * TRAIN_SAMPLES)
+        frame = load_frame_from_video(os.path.join("test_data/", VIDEO_FILE))
        
-        all_frames.extend(train_frames)
-        all_details.append({
-            "frames": train_frames,
-            "width": w,
-            "height": h,
-            "crf": CRF,
-            "preset_speed": PRESET_SPEED,
-            "video_file": VIDEO_FILE
-        })
+        if frame is not None:
+            all_frames.append(preprocess(frame))
+            all_details.append({
+                "frame": frame,
+                "crf": CRF,
+                "preset_speed": PRESET_SPEED,
+                "video_file": VIDEO_FILE
+            })
    return all_details

-def generate_frame_sequences(frames):
-    sequences = []
-    labels = []
-    for i in range(len(frames) - NUM_FRAMES + 1):
-        sequence = frames[i:i+NUM_FRAMES-1]
-        sequences.append(sequence)
-        labels.append(sequence[-1])
-    return np.array(sequences), np.array(labels)
-
-def frame_difference(frames):
-    differences = []
-    for i in range(1, len(frames)):
-        differences.append(cv2.absdiff(frames[i], frames[i-1]))
-    return differences
-
 def main():
    all_video_details_train = load_video_from_list("test_data/training.json")
    all_video_details_val = load_video_from_list("test_data/validation.json")

-    model = VideoCompressionModel(NUM_CHANNELS, NUM_FRAMES)
+    model = VideoCompressionModel(NUM_CHANNELS)
    model.compile(loss='mean_squared_error', optimizer='adam')
    early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True)

-    # Load and concatenate all sequences and labels
-    all_train_sequences = []
-    all_val_sequences = []
-    all_train_labels = []
-    all_val_labels = []
+    # Prepare data
+    all_train_frames = []
+    all_val_frames = []
    all_crf_train = []
    all_crf_val = []
    all_preset_speed_train = []
    all_preset_speed_val = []
    
    for video_details_train, video_details_val in zip(all_video_details_train, all_video_details_val):
-        train_frames = video_details_train["frames"]
-        val_frames = video_details_val["frames"]
-
-        train_differences = frame_difference(preprocess(train_frames))
-        val_differences = frame_difference(preprocess(val_frames))
-        
-        #print(len(train_differences), train_differences[0].shape)
-
-        train_sequences, train_labels = generate_frame_sequences(train_differences)
-        val_sequences, val_labels = generate_frame_sequences(val_differences)
-
-        crf_array_train = np.full((len(train_sequences), 1), video_details_train['crf'])
-        crf_array_val = np.full((len(val_sequences), 1), video_details_val['crf'])
-        preset_speed_array_train = np.full((len(train_sequences), 1), video_details_train['preset_speed'])
-        preset_speed_array_val = np.full((len(val_sequences), 1), video_details_val['preset_speed'])
-
-        all_train_sequences.extend(train_sequences)
-        all_val_sequences.extend(val_sequences)
-        all_train_labels.extend(train_labels)
-        all_val_labels.extend(val_labels)
-        all_crf_train.extend(crf_array_train)
-        all_crf_val.extend(crf_array_val)
-        all_preset_speed_train.extend(preset_speed_array_train)
-        all_preset_speed_val.extend(preset_speed_array_val)
+        all_train_frames.append(video_details_train["frame"])
+        all_val_frames.append(video_details_val["frame"])
+        all_crf_train.append(video_details_train['crf'])
+        all_crf_val.append(video_details_val['crf'])
+        all_preset_speed_train.append(video_details_train['preset_speed'])
+        all_preset_speed_val.append(video_details_val['preset_speed'])

    # Convert lists to numpy arrays
-    all_train_sequences = np.array(all_train_sequences)
-    all_val_sequences = np.array(all_val_sequences)
-    all_train_labels = np.array(all_train_labels)
-    all_val_labels = np.array(all_val_labels)
+    all_train_frames = np.array(all_train_frames)
+    all_val_frames = np.array(all_val_frames)
    all_crf_train = np.array(all_crf_train)
    all_crf_val = np.array(all_crf_val)
    all_preset_speed_train = np.array(all_preset_speed_train)
    all_preset_speed_val = np.array(all_preset_speed_val)

-    # Shuffle the training data
-    indices_train = np.arange(all_train_sequences.shape[0])
-    np.random.shuffle(indices_train)
-
-    all_train_sequences = all_train_sequences[indices_train]
-    all_train_labels = all_train_labels[indices_train]
-    all_crf_train = all_crf_train[indices_train]
-    all_preset_speed_train = all_preset_speed_train[indices_train]
-
-    print("\nTraining the model on mixed sequences...")
+    print("\nTraining the model on frame pairs...")
    model.fit(
-        {"frames": all_train_sequences, "crf": all_crf_train, "preset_speed": all_preset_speed_train},
-        all_train_labels,
+        {"frame": all_train_frames, "crf": all_crf_train, "preset_speed": all_preset_speed_train},
+        all_val_frames,  # Target: frames loaded from validation.json (presumably the compressed counterparts of the training frames; verify)
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
-        validation_data=({"frames": all_val_sequences, "crf": all_crf_val, "preset_speed": all_preset_speed_val}, all_val_labels),
+        validation_data=({"frame": all_val_frames, "crf": all_crf_val, "preset_speed": all_preset_speed_val}, all_val_frames),
        callbacks=[early_stop]
    )
    print("\nTraining completed!")

-    save_model(model, 'model_differencing.keras')
+    save_model(model, 'model.keras')

 if __name__ == "__main__":
    main()
--- a/video_compression_model.py
+++ b/video_compression_model.py
@@ -2,7 +2,6 @@ import tensorflow as tf

 PRESET_SPEED_CATEGORIES = ["ultrafast", "superfast", "veryfast", "faster", "fast", "medium", "slow", "slower", "veryslow"]
 NUM_PRESET_SPEEDS = len(PRESET_SPEED_CATEGORIES)
-NUM_FRAMES = 5       # Number of consecutive frames in a sequence
 NUM_CHANNELS = 3     # Number of color channels in the video frames (RGB images have 3 channels)

 #policy = tf.keras.mixed_precision.Policy('mixed_float16')
@@ -13,7 +12,6 @@ class VideoCompressionModel(tf.keras.Model):
        super(VideoCompressionModel, self).__init__()

        self.NUM_CHANNELS = NUM_CHANNELS
-        self.NUM_FRAMES = NUM_FRAMES
        
        # Regularization
        self.regularizer = tf.keras.regularizers.l2(regularization_factor)
@@ -23,21 +21,24 @@ class VideoCompressionModel(tf.keras.Model):

        # Encoder layers
        self.encoder = tf.keras.Sequential([
-            tf.keras.layers.Conv3D(32, (3, 3, 3), activation='relu', padding='same', input_shape=(None, None, None, NUM_CHANNELS + 1 + 16), kernel_regularizer=self.regularizer),
-            tf.keras.layers.MaxPooling3D((2, 2, 2)),
+            tf.keras.layers.ZeroPadding2D(padding=((1, 1), (1, 1))),  # Padding to preserve spatial dimensions
+            tf.keras.layers.Conv2D(32, (3, 3), activation='relu', padding='same', kernel_regularizer=self.regularizer),
+            tf.keras.layers.MaxPooling2D((2, 2)),
            # Add more encoder layers as needed
        ])

        # Decoder layers
        self.decoder = tf.keras.Sequential([
-            tf.keras.layers.Conv3DTranspose(32, (3, 3, 3), activation='relu', padding='same', kernel_regularizer=self.regularizer),
-            tf.keras.layers.UpSampling3D((2, 2, 2)),
+            tf.keras.layers.Conv2DTranspose(32, (3, 3), activation='relu', padding='same', kernel_regularizer=self.regularizer),
+            tf.keras.layers.UpSampling2D((2, 2)),
            # Add more decoder layers as needed
-            tf.keras.layers.Conv3D(NUM_CHANNELS, (3, 3, 3), activation='sigmoid', padding='same', kernel_regularizer=self.regularizer)  # Output layer for video frames
+            tf.keras.layers.Conv2D(NUM_CHANNELS, (3, 3), activation='sigmoid', padding='same', kernel_regularizer=self.regularizer),  # Output layer for video frames
+            tf.keras.layers.Cropping2D(cropping=((1, 1), (1, 1)))  # Adjust cropping to ensure dimensions match
+
        ])

    def call(self, inputs):
-        frames = inputs["frames"]
+        frame = inputs["frame"]
        crf = tf.expand_dims(inputs["crf"], -1)
        preset_speed = inputs["preset_speed"]

@@ -46,15 +47,15 @@ class VideoCompressionModel(tf.keras.Model):
        preset_embedding = tf.keras.layers.Flatten()(preset_embedding)
        
        # Concatenate crf and preset_embedding to frames
-        frames_shape = tf.shape(frames)
-        repeated_crf = tf.tile(tf.reshape(crf, (-1, 1, 1, 1, 1)), [1, frames_shape[1], frames_shape[2], frames_shape[3], 1])
-        repeated_preset = tf.tile(tf.reshape(preset_embedding, (-1, 1, 1, 1, 16)), [1, frames_shape[1], frames_shape[2], frames_shape[3], 1])
+        frame_shape = tf.shape(frame)
+        repeated_crf = tf.tile(tf.reshape(crf, (-1, 1, 1, 1)), [1, frame_shape[1], frame_shape[2], 1])
+        repeated_preset = tf.tile(tf.reshape(preset_embedding, (-1, 1, 1, 16)), [1, frame_shape[1], frame_shape[2], 1])
        
-        frames = tf.concat([frames, repeated_crf, repeated_preset], axis=-1)
+        frame = tf.concat([tf.cast(frame, tf.float32), repeated_crf, repeated_preset], axis=-1)

-        # Encoding the video frames
-        compressed_representation = self.encoder(frames)
+        # Encoding the frame
+        compressed_representation = self.encoder(frame)

-        # Decoding to generate compressed video frames
-        reconstructed_frames = self.decoder(compressed_representation)
-        return reconstructed_frames[:,-1,:,:,:]
+        # Decoding to generate compressed frame
+        reconstructed_frame = self.decoder(compressed_representation)
+        return reconstructed_frame