Today I will show the source code for a simple CUDA kernel running on a Tesla T4 GPU in Google Colab with Python 3.12.13.
The source code works very well; I will make a video with the results.
The T4 has thousands of CUDA Cores. The kernel divides the image into "Blocks" (16x16 pixel squares) and distributes them across these cores.
This source code is a high-performance image processing pipeline that bridges Python and CUDA (C++) to create a dynamic video effect. By using PyTorch's load_inline feature, it compiles custom GPU code on the fly to manipulate pixels at massive scale.
Loading the Image: It reads the image and moves it from the CPU (System RAM) to the GPU (Video RAM) using .cuda().
The Animation Loop: It runs a loop for each video frame, calculating where the "Zoom Ball" should be at that specific millisecond.
The code inside cuda_source is a Kernel, a special function designed to run on thousands of GPU cores simultaneously.
Instead of processing one pixel at a time (like a CPU would), the GPU assigns a specific thread to every single pixel in the image. If your image has 1 million pixels, 1 million threads start working at the same time.
If a pixel is inside the ball's radius, the thread "re-maps" its coordinate. It looks at the original image but pulls a pixel from closer to the center of the ball.
Because the zoom calculation for the top-left pixel doesn't depend on the bottom-right pixel, the GPU completes the entire frame transformation in microseconds.
Let's see the source code:
import torch
from torch.utils.cpp_extension import load_inline
from PIL import Image
import numpy as np
import cv2 # Pentru a salva animația ca video
from google.colab import files # Pentru a descărca rezultatul
import os
import shutil
# --- 0. Curățare și Pregătire Mediu ---
# Instalăm ninja dacă nu există
!pip install ninja -q
# Ștergem cache-ul vechi pentru a forța recompilarea curată
extensions_dir = '/root/.cache/torch_extensions/py312_cu128/swim_animator'
if os.path.exists(extensions_dir):
shutil.rmtree(extensions_dir)
print(f"Cache șters: {extensions_dir}")
# Verificăm GPU
print(f"GPU disponibil: {torch.cuda.is_available()}")
if torch.cuda.is_available():
print(f"Nume GPU: {torch.cuda.get_device_name(0)}")
# --- 1. CUDA source ("zoom ball" magnifier effect) ---
# Raw string holding the CUDA kernel and its C++ host wrapper. It is compiled
# at runtime; the wrapper's signature must match the declaration in cpp_source.
cuda_source = r"""
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAGuard.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>

// One thread per output pixel of an interleaved 3-channel 8-bit image.
// Pixels inside a circle centred at (ball_center_x, height / 2) with radius
// height / 2 are sampled from a point closer to the circle centre, which
// magnifies the image there; all other pixels are copied unchanged.
__global__ void zoom_ball_kernel(
    const unsigned char* __restrict__ input,
    unsigned char* __restrict__ output,
    int width, int height, float ball_center_x, float max_zoom)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // The grid is rounded up to whole blocks, so threads past the right or
    // bottom edge have no pixel to process.
    if (x >= width || y >= height) {
        return;
    }

    const float ball_center_y = height / 2.0f;
    const float ball_radius = height / 2.0f;  // the ball is as tall as the image

    // 1. Distance from this pixel to the ball centre.
    const float dx = x - ball_center_x;
    const float dy = y - ball_center_y;
    const float distance = sqrtf(dx * dx + dy * dy);

    float src_x = (float)x;
    float src_y = (float)y;

    // 2. Inside the ball: apply the spherical zoom.
    if (distance < ball_radius) {
        // Smooth falloff: 1 at the centre, ~0 at the rim.
        const float norm_dist = distance / ball_radius;
        const float falloff = cosf(norm_dist * 3.14159f * 0.5f);
        // Local zoom: max_zoom at the centre, 1 (no zoom) at the rim.
        const float current_zoom = 1.0f + (max_zoom - 1.0f) * falloff;
        // Pull the sample position towards the ball centre.
        src_x = ball_center_x + (dx / current_zoom);
        src_y = ball_center_y + (dy / current_zoom);
    }

    // 3. Clamp the source coordinates to the image (the ball may be partly
    //    or entirely outside the frame).
    const int isrc_x = max(0, min(width - 1, (int)src_x));
    const int isrc_y = max(0, min(height - 1, (int)src_y));

    // size_t offsets so very large images cannot overflow 32-bit arithmetic.
    const size_t out_idx = ((size_t)y * width + x) * 3;
    const size_t src_idx = ((size_t)isrc_y * width + isrc_x) * 3;
    output[out_idx]     = input[src_idx];      // R
    output[out_idx + 1] = input[src_idx + 1];  // G
    output[out_idx + 2] = input[src_idx + 2];  // B
}

// Host wrapper: validates the tensor, launches the kernel with one thread per
// pixel in 16x16 blocks, and returns a new tensor of the same shape.
torch::Tensor apply_zoom_ball_effect(torch::Tensor input, float ball_center_x, float max_zoom) {
    // The kernel dereferences raw device pointers and assumes a packed
    // (height, width, 3) uint8 layout, so reject anything else up front.
    TORCH_CHECK(input.is_cuda(), "apply_zoom_ball_effect: input must be a CUDA tensor");
    TORCH_CHECK(input.scalar_type() == torch::kUInt8,
                "apply_zoom_ball_effect: input must be a uint8 tensor");
    TORCH_CHECK(input.dim() == 3 && input.size(2) == 3,
                "apply_zoom_ball_effect: input must have shape (height, width, 3)");
    // current_zoom lies between 1 and max_zoom; a non-positive max_zoom could
    // make it zero and divide by zero in the kernel.
    TORCH_CHECK(max_zoom > 0.0f, "apply_zoom_ball_effect: max_zoom must be positive");

    // The kernel indexes with packed row-major offsets.
    input = input.contiguous();

    const int height = input.size(0);
    const int width = input.size(1);
    auto output = torch::empty_like(input);

    // A zero-sized grid is an invalid launch configuration.
    if (input.numel() == 0) {
        return output;
    }

    // Run on the tensor's device and on PyTorch's current stream so the
    // kernel is ordered with the other work queued on this tensor.
    const c10::cuda::CUDAGuard device_guard(input.device());

    dim3 block_dim(16, 16);
    dim3 grid_dim((width + block_dim.x - 1) / block_dim.x,
                  (height + block_dim.y - 1) / block_dim.y);
    zoom_ball_kernel<<<grid_dim, block_dim, 0, at::cuda::getCurrentCUDAStream()>>>(
        input.data_ptr<unsigned char>(),
        output.data_ptr<unsigned char>(),
        width, height, ball_center_x, max_zoom);
    // Surface launch failures here instead of at a later, unrelated CUDA call.
    C10_CUDA_KERNEL_LAUNCH_CHECK();
    return output;
}
"""
# C++ declaration of the function defined in the CUDA source, so the generated
# Python binding can reference it.
cpp_source = "\n".join(
    [
        "",
        "#include <torch/extension.h>",
        "torch::Tensor apply_zoom_ball_effect(torch::Tensor input, float ball_center_x, float max_zoom);",
        "",
    ]
)
# --- 2. JIT compilation ---
print("Compilăm kernel-ul CUDA... (poate dura 30-60 secunde)")
# Build the extension from the two in-memory sources and expose
# apply_zoom_ball_effect as a function of the returned module.
swim_module = load_inline(
    "swim_animator",
    cpp_source,
    cuda_sources=cuda_source,
    functions=["apply_zoom_ball_effect"],
    verbose=False,
    # Target compute capability 7.5, the architecture of the Tesla T4.
    extra_cuda_cflags=["-arch=sm_75"],
)
print("Compilare reușită!")
# --- 3. Animation rendering function ---
def create_zoom_ball_video(
    image_path,
    output_video="bila_zoom.mp4",
    *,
    fps=30,
    duration_sec=5,
    max_zoom=2.5,
):
    """Render a video of a magnifying "zoom ball" sweeping across an image.

    The image is uploaded to the GPU once. For every frame the ball's
    horizontal centre is interpolated linearly from just beyond the right
    edge to just beyond the left edge, the CUDA effect is applied, and the
    result is appended to an MP4 file.

    Args:
        image_path: Path of the source image; it is converted to RGB.
        output_video: Path of the MP4 file to write.
        fps: Frames per second of the output video.
        duration_sec: Length of the animation in seconds.
        max_zoom: Magnification at the centre of the ball.

    Returns:
        None. Failures to find the image or open the output file are reported
        with a printed message rather than an exception.
    """
    if not os.path.exists(image_path):
        print(f"Eroare: Imaginea {image_path} nu există.")
        return

    # Close the image file as soon as the pixels are decoded.
    with Image.open(image_path) as img_file:
        img_pil = img_file.convert('RGB')
    width, height = img_pil.size
    img_tensor = torch.from_numpy(np.array(img_pil)).cuda()

    # At least one frame, even for a very short duration.
    num_frames = max(1, int(round(fps * duration_sec)))

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video_writer = cv2.VideoWriter(output_video, fourcc, fps, (width, height))
    try:
        # Without this check a failed open silently drops every frame.
        if not video_writer.isOpened():
            print(f"Eroare: Nu s-a putut deschide {output_video} pentru scriere.")
            return

        print(f"Generăm {num_frames} cadre pe GPU...")

        # The ball is as tall as the image. It starts fully outside the right
        # edge and ends fully outside the left edge.
        ball_radius = height / 2.0
        start_x = width + ball_radius
        end_x = -ball_radius

        for i in range(num_frames):
            # Animation progress from 0.0 to 1.0 (a single frame stays at 0.0).
            progress = i / (num_frames - 1) if num_frames > 1 else 0.0
            current_ball_x = start_x + (end_x - start_x) * progress

            frame_tensor = swim_module.apply_zoom_ball_effect(
                img_tensor, current_ball_x, max_zoom
            )

            # Back to the CPU and into OpenCV's BGR channel order.
            frame_np = frame_tensor.cpu().numpy()
            frame_bgr = cv2.cvtColor(frame_np, cv2.COLOR_RGB2BGR)
            video_writer.write(frame_bgr)
    finally:
        # Always finalise the file, even if a frame fails part-way through.
        video_writer.release()

    print("Video salvat!")
# --- 4. Execution ---
# Download a test image (notebook shell escape). The commented-out line is an
# alternative, wider image that shows the traversal effect more clearly.
#!wget -O peisaj.jpg https://upload.wikimedia.org/wikipedia/commons/thumb/c/c8/Altja_j%C3%B5gi_Lahemaal.jpg/1280px-Altja_j%C3%B5gi_Lahemaal.jpg
!wget -O peste.jpg https://upload.wikimedia.org/wikipedia/commons/thumb/7/7f/Balantiocheilos_melanopterus_-_Karlsruhe_Zoo_02_%28cropped%29.jpg/960px-Balantiocheilos_melanopterus_-_Karlsruhe_Zoo_02_%28cropped%29.jpg
# Render the animation to the function's default output file, bila_zoom.mp4.
create_zoom_ball_video("peste.jpg")
# Download the result to the local machine
files.download("bila_zoom.mp4")