I Tried Building a Hand Gesture Mouse Controller in Python

Why I Made This
Being new to coding, I wanted to build something futuristic, fun, and slightly ambitious. I settled on a Hand Gesture Mouse Controller built with Python, OpenCV, and MediaPipe. The idea of controlling a computer with nothing but your hand felt like science fiction, and I was all in.

TL;DR: It wasn't flawless, but I learned a lot about image processing, Python libraries, and how hand gestures can drive real-world actions.

What I Used

  • Python
  • OpenCV (for video processing)
  • MediaPipe (for hand detection and landmarks)
  • PyAutoGUI (for mouse movement and clicking)
  • pycaw + comtypes (for system volume control; Windows-only)
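
Everything installs with pip (opencv-python, mediapipe, pyautogui, pycaw). Before wiring it all together, a quick smoke test like this confirms the webcam-plus-MediaPipe half works on its own. It's just a sketch, not part of the final script:

import cv2
import mediapipe as mp

# Track a single hand and print where the index fingertip is.
hands = mp.solutions.hands.Hands(max_num_hands=1, min_detection_confidence=0.75)
cap = cv2.VideoCapture(0)

while True:
    success, img = cap.read()
    if not success:
        break
    results = hands.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    if results.multi_hand_landmarks:
        # Landmark 8 is the index fingertip, in normalized 0-1 coordinates.
        tip = results.multi_hand_landmarks[0].landmark[8]
        print(f"index fingertip: x={tip.x:.2f}, y={tip.y:.2f}")
    cv2.imshow("Smoke test", img)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

If coordinates stream by as you wave your hand, the hard part is done; the rest is turning those numbers into actions.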

How It Works
Here's the big-picture logic:

  1. Capture webcam frames with OpenCV.
  2. Detect hand landmarks with MediaPipe.
  3. Track fingertip positions, such as the thumb, index, and pinky.
  4. Map the hand's position onto screen coordinates.
  5. Perform click, scroll, volume, and screenshot gestures.
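
Step 4 is where most of the cursor "feel" comes from, so here it is in isolation. This is just a sketch with illustrative values (the real script below gets the screen size from pyautogui.size()): interpolate from a margin-inset camera box out to the full screen, then smooth so the cursor doesn't jitter.

import numpy as np

wCam, hCam, frameR = 640, 480, 100   # camera size and dead margin (px)
screen_w, screen_h = 1920, 1080      # illustrative screen size
smoothening = 6                      # higher = smoother but laggier

plocX, plocY = 0.0, 0.0              # previous cursor position

def map_to_screen(x1, y1):
    """Map a fingertip pixel position to a smoothed screen position."""
    global plocX, plocY
    # Interpolate from the inset camera box to the full screen, so you can
    # reach the screen edges without dragging your hand out of frame.
    x3 = np.interp(x1, (frameR, wCam - frameR), (0, screen_w))
    y3 = np.interp(y1, (frameR, hCam - frameR), (0, screen_h))
    # Exponential smoothing: step a fraction of the way toward the target.
    plocX += (x3 - plocX) / smoothening
    plocY += (y3 - plocY) / smoothening
    return plocX, plocY

print(map_to_screen(320, 240))  # first call steps 1/6 of the way from (0, 0) toward (960, 540)

With that building block in hand, here's the full script:
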
import cv2
import mediapipe as mp
import pyautogui
import numpy as np
import time
import math
from ctypes import cast, POINTER
from comtypes import CLSCTX_ALL
from pycaw.pycaw import AudioUtilities, IAudioEndpointVolume

pyautogui.FAILSAFE = False   # cursor legitimately reaches screen corners; Ctrl+C the script to stop it
wCam, hCam = 640, 480        # webcam capture size
frameR = 100                 # margin (px) inset from the frame edges for cursor mapping
smoothening = 6              # cursor smoothing factor (higher = smoother but laggier)
plocX, plocY = 0, 0          # previous cursor position
clocX, clocY = 0, 0          # current (smoothed) cursor position
click_state = False          # debounce flag so one pinch fires one click
scroll_timer = time.time()   # rate-limits scroll events
screenshot_timer = 0         # when the open-palm pose started (0 = not held)

# Volume control setup (pycaw talks to the Windows Core Audio endpoint)
devices = AudioUtilities.GetSpeakers()
interface = devices.Activate(IAudioEndpointVolume._iid_, CLSCTX_ALL, None)
volume = cast(interface, POINTER(IAudioEndpointVolume))
vol_min, vol_max = volume.GetVolumeRange()[:2]  # endpoint volume range in dB

cap = cv2.VideoCapture(0)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, wCam)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, hCam)
screen_w, screen_h = pyautogui.size()

mpHands = mp.solutions.hands
hands = mpHands.Hands(max_num_hands=1, min_detection_confidence=0.75)
mpDraw = mp.solutions.drawing_utils

while True:
    success, img = cap.read()
    if not success:
        continue  # skip dropped frames
    img = cv2.flip(img, 1)  # mirror so the cursor follows your hand naturally
    imgRGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # MediaPipe expects RGB
    results = hands.process(imgRGB)

    if results.multi_hand_landmarks:
        for handLms in results.multi_hand_landmarks:
            lm = handLms.landmark
            x1 = int(lm[8].x * wCam)   # index fingertip, in pixels
            y1 = int(lm[8].y * hCam)
            cx, cy = int(lm[0].x * wCam), int(lm[0].y * hCam)  # wrist (handy for debugging)

            # A finger counts as "up" when its tip is above its PIP joint
            # (MediaPipe landmarks: 8 index, 12 middle, 16 ring, 20 pinky).
            tips = [8, 12, 16, 20]
            fingers = [1 if lm[tip].y < lm[tip - 2].y else 0 for tip in tips]

            # Cursor move: index finger up, all others down.
            if fingers == [1, 0, 0, 0]:
                x3 = np.interp(x1, (frameR, wCam - frameR), (0, screen_w))
                y3 = np.interp(y1, (frameR, hCam - frameR), (0, screen_h))
                clocX = plocX + (x3 - plocX) / smoothening
                clocY = plocY + (y3 - plocY) / smoothening
                pyautogui.moveTo(clocX, clocY)
                plocX, plocY = clocX, clocY

            # Click: pinch thumb (4) and index (8) tips together. The gap
            # between the press (0.03) and release (0.05) thresholds acts as
            # hysteresis, so one pinch fires exactly one click.
            thumb_tip = lm[4]
            index_tip = lm[8]
            dist_click = np.linalg.norm(np.array([thumb_tip.x, thumb_tip.y]) - np.array([index_tip.x, index_tip.y]))
            if dist_click < 0.03 and not click_state:
                pyautogui.click()
                click_state = True
            elif dist_click > 0.05:
                click_state = False

            # Scroll: index and middle up, ring and pinky down (kept distinct
            # from the open-palm screenshot pose). Direction depends on where
            # the fingertips sit in the frame.
            if fingers == [1, 1, 0, 0]:
                if time.time() - scroll_timer > 0.25:  # rate-limit to ~4 events/sec
                    if y1 < hCam // 2:
                        pyautogui.scroll(60)    # fingertips in upper half: scroll up
                    else:
                        pyautogui.scroll(-60)   # lower half: scroll down
                    scroll_timer = time.time()

            # Volume: make a fist and tilt the hand; the angle of the knuckle
            # line from index MCP (5) to pinky MCP (17) picks the direction.
            if fingers == [0, 0, 0, 0]:
                x5, y5 = lm[5].x, lm[5].y
                x17, y17 = lm[17].x, lm[17].y
                angle = math.degrees(math.atan2(y17 - y5, x17 - x5))
                if angle > 30:
                    volume.SetMasterVolumeLevel(min(vol_max, volume.GetMasterVolumeLevel() + 1.0), None)  # +1 dB
                elif angle < -30:
                    volume.SetMasterVolumeLevel(max(vol_min, volume.GetMasterVolumeLevel() - 1.0), None)  # -1 dB

            # Screenshot: hold an open palm (all four fingers up) for 2 seconds.
            if fingers == [1, 1, 1, 1]:
                if screenshot_timer == 0:
                    screenshot_timer = time.time()
                elif time.time() - screenshot_timer > 2:
                    pyautogui.screenshot().save("screenshot.png")
                    screenshot_timer = 0
            else:
                screenshot_timer = 0

            mpDraw.draw_landmarks(img, handLms, mpHands.HAND_CONNECTIONS)  # draw the hand skeleton

    cv2.imshow("Hand Gesture Controller", img)
    if cv2.waitKey(1) & 0xFF == ord('q'):  # press q to quit
        break

cap.release()
cv2.destroyAllWindows()
