Why I Made This
As someone new to coding, I wanted to build something futuristic, fun, and slightly complicated, so I went with a Hand Gesture Mouse Controller based on Python, OpenCV, and MediaPipe. The idea of controlling your computer with nothing but your hand felt like science fiction, and I was all in.
TL;DR: It wasn't flawless, but I learned a lot about image processing, Python libraries, and how hand gestures can drive real actions on a computer.
What I Used
- Python
- OpenCV (for video processing)
- MediaPipe (for hand detection and landmarks)
- PyAutoGUI (for mouse movement and clicking)
- pycaw (for volume control on Windows)
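
If you want to try it yourself, everything here installs with pip; the package names are opencv-python, mediapipe, pyautogui, pycaw, and comtypes. The script also uses numpy, which opencv-python pulls in anyway.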
How It Works
Here's the big-picture logic:
- Capture webcam frames with OpenCV.
- Detect hand landmarks with MediaPipe.
- Track fingertip positions (thumb, index, pinky, and so on).
- Map the hand movement to screen coordinates.
- Recognize gestures and trigger clicks, scrolling, volume changes, and screenshots.
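Here's the full script, kept in a single file so it's easy to follow: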
```python
import cv2
import mediapipe as mp
import pyautogui
import numpy as np
import time
import math
from ctypes import cast, POINTER
from comtypes import CLSCTX_ALL
from pycaw.pycaw import AudioUtilities, IAudioEndpointVolume

pyautogui.FAILSAFE = False

# Camera size and cursor smoothing settings
wCam, hCam = 640, 480
frameR = 100          # frame margin: only the inner region of the camera view maps to the screen
smoothening = 6       # higher = smoother but slower cursor movement
plocX, plocY = 0, 0   # previous cursor position
clocX, clocY = 0, 0   # current cursor position
click_state = False
scroll_timer = time.time()
screenshot_timer = 0

# Volume control setup (Windows, via pycaw)
devices = AudioUtilities.GetSpeakers()
interface = devices.Activate(IAudioEndpointVolume._iid_, CLSCTX_ALL, None)
volume = cast(interface, POINTER(IAudioEndpointVolume))
vol_min, vol_max = volume.GetVolumeRange()[:2]

cap = cv2.VideoCapture(0)
cap.set(3, wCam)
cap.set(4, hCam)
screen_w, screen_h = pyautogui.size()

mpHands = mp.solutions.hands
hands = mpHands.Hands(max_num_hands=1, min_detection_confidence=0.75)
mpDraw = mp.solutions.drawing_utils

while True:
    success, img = cap.read()
    if not success:
        continue
    img = cv2.flip(img, 1)  # mirror the image so movement feels natural
    imgRGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    results = hands.process(imgRGB)

    if results.multi_hand_landmarks:
        for handLms in results.multi_hand_landmarks:
            lm = handLms.landmark
            x1 = int(lm[8].x * wCam)   # index fingertip, in pixels
            y1 = int(lm[8].y * hCam)
            cx, cy = int(lm[0].x * wCam), int(lm[0].y * hCam)  # wrist (landmark 0)

            # Which fingers are up? (index, middle, ring, pinky)
            tips = [8, 12, 16, 20]
            fingers = [1 if lm[tip].y < lm[tip - 2].y else 0 for tip in tips]

            # Only the index finger up: move the cursor
            if fingers == [1, 0, 0, 0]:
                x3 = np.interp(x1, (frameR, wCam - frameR), (0, screen_w))
                y3 = np.interp(y1, (frameR, hCam - frameR), (0, screen_h))
                clocX = plocX + (x3 - plocX) / smoothening
                clocY = plocY + (y3 - plocY) / smoothening
                pyautogui.moveTo(clocX, clocY)
                plocX, plocY = clocX, clocY

            # Pinch thumb and index fingertips together: click
            thumb_tip = lm[4]
            index_tip = lm[8]
            dist_click = np.linalg.norm(np.array([thumb_tip.x, thumb_tip.y]) - np.array([index_tip.x, index_tip.y]))
            if dist_click < 0.03 and not click_state:
                pyautogui.click()
                click_state = True
            elif dist_click > 0.05:
                click_state = False  # fingers must separate before the next click can fire

            # Index and middle fingers up: scroll
            if fingers[0] == 1 and fingers[1] == 1:
                if time.time() - scroll_timer > 0.25:
                    if lm[8].y < lm[6].y and lm[12].y < lm[10].y:
                        pyautogui.scroll(-60)
                    elif lm[8].y > lm[6].y and lm[12].y > lm[10].y:
                        pyautogui.scroll(60)
                    scroll_timer = time.time()

            # Closed fist: tilt it to change the volume
            if fingers == [0, 0, 0, 0]:
                x5, y5 = lm[5].x, lm[5].y      # index knuckle
                x17, y17 = lm[17].x, lm[17].y  # pinky knuckle
                angle = math.degrees(math.atan2(y17 - y5, x17 - x5))
                if angle > 30:
                    volume.SetMasterVolumeLevel(min(vol_max, volume.GetMasterVolumeLevel() + 1.0), None)
                elif angle < -30:
                    volume.SetMasterVolumeLevel(max(vol_min, volume.GetMasterVolumeLevel() - 1.0), None)

            # Open palm held for two seconds: take a screenshot
            if fingers == [1, 1, 1, 1]:
                if screenshot_timer == 0:
                    screenshot_timer = time.time()
                elif time.time() - screenshot_timer > 2:
                    pyautogui.screenshot().save("screenshot.png")
                    screenshot_timer = 0
            else:
                screenshot_timer = 0

            mpDraw.draw_landmarks(img, handLms, mpHands.HAND_CONNECTIONS)

    cv2.imshow("Hand Gesture Controller", img)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()
```
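
For quick reference, here's what each gesture does:

- Index finger up: move the cursor.
- Pinch thumb and index fingertips: click.
- Index and middle fingers up: scroll.
- Closed fist, tilted to one side: volume up or down.
- Open palm held for about two seconds: save a screenshot.
- Press q in the preview window to quit.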