13
Day5: Gesture Based Writing Method
Hello surfer, welcome to this blog where I will be writing a Python code to write something in canvas by moving fingers in front of the camera. Nearly a year ago, I was doing this project but I thougt if I could use just contours then why use gestures? In that project, I used concept of background subtraction and then with the help of contour extraction, I wrote something on canvas. It was 6 part series of blogs and I hope you will find it interesting but I am going to do the same task except using the gestures except of contours and I am not planning on using VUI (Virtual User Interface) here too because if I can use gestures then why bother writng complex code?
This blog is the part of the series #7DaysOfComputerVisionProjects. Links to the blogs and videos of each projects are:
And here is the demo of that system:
Everything will eb different in this project but the end goal is somewhat similar. I will use concept of gestures from earlier projects.
import mediapipe as mp
import cv2
import numpy as np
import matplotlib.pyplot as plt
We will need to map our finger landmarks from pixel world to the new canvas world.
def show(img, fsize=(10, 10)):
figure = plt.figure(figsize=fsize)
plt.imshow(img)
plt.show()
show(np.zeros((10, 10)))
def frame_pos2screen_pos(frame_size=(480, 640), screen_size=(768, 1366), frame_pos=None):
x,y = screen_size[0]/frame_size[0], screen_size[1]/frame_size[1]
screen_pos = [frame_pos[0]*y, frame_pos[1]*x]
return np.array(screen_pos).astype(np.uint32)
def euclidean(pt1, pt2):
d = np.sqrt((pt1[0]-pt2[0])**2+(pt1[1]-pt2[1])**2)
return d
euclidean((4, 3), (0, 0))

5.0
cam = cv2.VideoCapture(0)
fsize
as frame simze and ssize
as screen size.
fsize = (520, 720)
ssize = (520, 720)
bg_color = (100, 100, 100)
pen_size = 8
ink_size = 10
colors = {"red": [0, 0, 255], "blue":[255, 0, 0], "green":[0, 255, 0]}
modes = ["write", "move", "erase"]
current_mode = "draw"
last_mode = None
current_color = "red"
cnames = list(colors.keys())
ink_color = colors[current_color]
pen_color = np.array(ink_color)-[100, 50, 100]
pen_color = pen_color.astype(np.uint8).tolist()
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands
left, top, right, bottom = (400, 100, 700, 300)
canvas = np.zeros((ssize[0], ssize[1], 3))+bg_color
canvas = canvas.astype(np.uint8)
check_every = 5
check_cnt = 0
with mp_hands.Hands(static_image_mode=True,
max_num_hands = 1,
min_detection_confidence=0.2) as hands:
while cam.isOpened():
ret, frame = cam.read()
if not ret:
continue
frame = cv2.GaussianBlur(frame, (5, 5), -1)
temp_canvas
is a canvas that we define in every frame and add new changes into it. Similarly, we will use pen_canvas
just to view pointer. Both should be of uint8
.
temp_canvas = np.zeros((ssize[0], ssize[1], 3)).astype(np.uint8)
pen_canvas = np.zeros((ssize[0], ssize[1], 3)).astype(np.uint8)
frame = cv2.flip(frame, 1)
frame = cv2.resize(frame, (fsize[1], fsize[0]))
h, w, _ = frame.shape
rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
res = hands.process(rgb)
if res.multi_hand_landmarks:
for hand_landmarks in res.multi_hand_landmarks:
index_tip = mp_drawing._normalized_to_pixel_coordinates(
hand_landmarks.landmark[mp_hands.HandLandmark.INDEX_FINGER_TIP].x,
hand_landmarks.landmark[mp_hands.HandLandmark.INDEX_FINGER_TIP].y,
w, h)
index_pip = np.array(mp_drawing._normalized_to_pixel_coordinates(
hand_landmarks.landmark[mp_hands.HandLandmark.INDEX_FINGER_PIP].x,
hand_landmarks.landmark[mp_hands.HandLandmark.INDEX_FINGER_PIP].y,
w, h))
thumb_tip = mp_drawing._normalized_to_pixel_coordinates(
hand_landmarks.landmark[mp_hands.HandLandmark.THUMB_TIP].x,
hand_landmarks.landmark[mp_hands.HandLandmark.THUMB_TIP].y,
w, h)
middle_tip = mp_drawing._normalized_to_pixel_coordinates(
hand_landmarks.landmark[mp_hands.HandLandmark.MIDDLE_FINGER_TIP].x,
hand_landmarks.landmark[mp_hands.HandLandmark.MIDDLE_FINGER_TIP].y,
w, h)
index_tipm = list(index_tip) # make list
index_tipm[0] = np.clip(index_tipm[0], left, right) # clip cloumn value within left to right
index_tipm[1] = np.clip(index_tipm[1], top, bottom) # clip row value within top to bottom
index_tipm[0] = (index_tipm[0]-left) * fsize[1]/(right-left) # column value from ROI to frame world
index_tipm[1] = (index_tipm[1]-top) * fsize[0]/(bottom-top) # row value from ROI to frame world
# convert coordinate from frame world to canvas or screen world
screen_pos = frame_pos2screen_pos(fsize, ssize, index_tipm)
if check_cnt >= check_every:
if thumb_tip is not None and index_pip is not None:
if euclidean(thumb_tip, index_pip)<60:
if current_mode == "draw" or current_mode == "erase":
current_mode = "move"
last_mode = "draw"
elif current_mode == "move":
current_mode="draw"
last_mode = "move"
if middle_tip is not None and index_tip is not None:
if euclidean(index_tip, middle_tip)<60:
if current_mode != "erase":
current_mode = "erase"
else:
current_mode = "move"
last_mode = "erase"
if thumb_tip is not None and middle_tip is not None:
if euclidean(thumb_tip, middle_tip)<60:
current_color = cnames[(cnames.index(current_color)+1)%len(cnames)]
ink_color = colors[current_color]
pen_color = np.array(ink_color)-[100, 50, 100]
pen_color = pen_color.astype(np.uint8).tolist()
# print(pen_color)
check_cnt = 0
if current_mode=="draw":
cv2.circle(temp_canvas, screen_pos, int(ink_size), ink_color, -1)
if current_mode=="erase":
cv2.circle(temp_canvas, screen_pos, int(ink_size), bg_color, -1)
cv2.circle(pen_canvas, screen_pos, int(pen_size), pen_color, -1)
mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
if current_mode in ["draw", "erase"]:
mask = np.sum(temp_canvas, axis=-1)!=0
canvas[mask]=temp_canvas[mask]
ncanvas = canvas.copy()
mask = np.sum(pen_canvas, axis=-1)!=0
ncanvas[mask]=pen_canvas[mask]
uint8
type.
d = np.zeros((100, ssize[1], 3), dtype="int")
text = f"Mode: {current_mode} | Color: {current_color} | Pen: {ink_size}"
cv2.putText(d, text, (10, 70), cv2.FONT_HERSHEY_COMPLEX, 1, (200, 100, 200), 2)
ncanvas = np.vstack([d, ncanvas]).astype(np.uint8)
nshape = ncanvas.shape
# draw a ROI rectangle
cv2.rectangle(frame, (left, top), (right, bottom), (0, 0, 255), 1)
frame = cv2.resize(frame, (nshape[1], nshape[0]))
cv2.imshow("Window", temp_canvas)
cv2.imshow("Canvas", np.hstack([frame, ncanvas]).astype(np.uint8))
check_cnt += 1
if cv2.waitKey(1)&0xFF == 27:
break
cam.release()
#out.release()
cv2.destroyAllWindows()
cam = cv2.VideoCapture(0)
fsize = (520, 720)
ssize = (520, 720)
bg_color = (100, 100, 100)
pen_size = 15
ink_size = 13
colors = {"red": [0, 0, 255], "blue":[255, 0, 0], "green":[0, 255, 0]}
modes = ["write", "move", "erase"]
current_mode = "draw"
last_mode = None
current_color = "red"
cnames = list(colors.keys())
ink_color = colors[current_color]
pen_color = np.array(ink_color)-[100, 50, 100]
pen_color = pen_color.astype(np.uint8).tolist()
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands
left, top, right, bottom = (400, 100, 700, 300)
canvas = np.zeros((ssize[0], ssize[1], 3))+bg_color
canvas = canvas.astype(np.uint8)
check_every = 5
check_cnt = 0
#out = cv2.VideoWriter("out.avi", cv2.VideoWriter_fourcc(*'XVID'), 30, (fsize[1], fsize[0]))
with mp_hands.Hands(static_image_mode=True,
max_num_hands = 1,
min_detection_confidence=0.2) as hands:
while cam.isOpened():
ret, frame = cam.read()
if not ret:
continue
frame = cv2.GaussianBlur(frame, (5, 5), -1)
temp_canvas = np.zeros((ssize[0], ssize[1], 3)).astype(np.uint8)
pen_canvas = np.zeros((ssize[0], ssize[1], 3)).astype(np.uint8)
frame = cv2.flip(frame, 1)
frame = cv2.resize(frame, (fsize[1], fsize[0]))
h, w, _ = frame.shape
rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
res = hands.process(rgb)
if res.multi_hand_landmarks:
for hand_landmarks in res.multi_hand_landmarks:
index_tip = mp_drawing._normalized_to_pixel_coordinates(
hand_landmarks.landmark[mp_hands.HandLandmark.INDEX_FINGER_TIP].x,
hand_landmarks.landmark[mp_hands.HandLandmark.INDEX_FINGER_TIP].y,
w, h)
index_pip = np.array(mp_drawing._normalized_to_pixel_coordinates(
hand_landmarks.landmark[mp_hands.HandLandmark.INDEX_FINGER_PIP].x,
hand_landmarks.landmark[mp_hands.HandLandmark.INDEX_FINGER_PIP].y,
w, h))
thumb_tip = mp_drawing._normalized_to_pixel_coordinates(
hand_landmarks.landmark[mp_hands.HandLandmark.THUMB_TIP].x,
hand_landmarks.landmark[mp_hands.HandLandmark.THUMB_TIP].y,
w, h)
middle_tip = mp_drawing._normalized_to_pixel_coordinates(
hand_landmarks.landmark[mp_hands.HandLandmark.MIDDLE_FINGER_TIP].x,
hand_landmarks.landmark[mp_hands.HandLandmark.MIDDLE_FINGER_TIP].y,
w, h)
#print(index_tip)
index_tipm = list(index_tip)
index_tipm[0] = np.clip(index_tipm[0], left, right)
index_tipm[1] = np.clip(index_tipm[1], top, bottom)
index_tipm[0] = (index_tipm[0]-left) * fsize[1]/(right-left)
index_tipm[1] = (index_tipm[1]-top) * fsize[0]/(bottom-top)
#print(index_tipm)
screen_pos = frame_pos2screen_pos(fsize, ssize, index_tipm)
if check_cnt >= check_every:
if thumb_tip is not None and index_pip is not None:
if euclidean(thumb_tip, index_pip)<60:
if current_mode == "draw" or current_mode == "erase":
current_mode = "move"
last_mode = "draw"
elif current_mode == "move":
current_mode="draw"
last_mode = "move"
if middle_tip is not None and index_pip is not None:
if euclidean(index_tip, middle_tip)<60:
if current_mode != "erase":
current_mode = "erase"
else:
current_mode = "move"
last_mode = "erase"
if thumb_tip is not None and middle_tip is not None:
if euclidean(thumb_tip, middle_tip)<60:
current_color = cnames[(cnames.index(current_color)+1)%len(cnames)]
ink_color = colors[current_color]
pen_color = np.array(ink_color)-[100, 50, 100]
pen_color = pen_color.astype(np.uint8).tolist()
# print(pen_color)
check_cnt = 0
if current_mode=="draw":
cv2.circle(temp_canvas, screen_pos, int(ink_size), ink_color, -1)
if current_mode=="erase":
cv2.circle(temp_canvas, screen_pos, int(ink_size), bg_color, -1)
cv2.circle(pen_canvas, screen_pos, int(pen_size), pen_color, -1)
mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
if current_mode in ["draw", "erase"]:
mask = np.sum(temp_canvas, axis=-1)!=0
canvas[mask]=temp_canvas[mask]
ncanvas = canvas.copy()
mask = np.sum(pen_canvas, axis=-1)!=0
ncanvas[mask]=pen_canvas[mask]
d = np.zeros((100, ssize[1], 3), dtype="int")
text = f"Mode: {current_mode} | Color: {current_color} | Pen: {ink_size}"
cv2.putText(d, text, (10, 70), cv2.FONT_HERSHEY_COMPLEX, 1, (200, 100, 200), 2)
ncanvas = np.vstack([d, ncanvas]).astype(np.uint8)
nshape = ncanvas.shape
cv2.rectangle(frame, (left, top), (right, bottom), (0, 0, 255), 1)
frame = cv2.resize(frame, (nshape[1], nshape[0]))
check_cnt += 1
cv2.imshow("Window", temp_canvas)
cv2.imshow("Canvas", np.hstack([frame, ncanvas]).astype(np.uint8))
if cv2.waitKey(1)&0xFF == 27:
break
cam.release()
#out.release()
cv2.destroyAllWindows()
This was just a demo of what can we do by moving just fingers in front of the camera. I have already done more advanced tasks by contours previously and my next goal will be to add features like sliding cursor size, changing background colors etc. But in the mean time please keep notified on our YouTube for more contents related to this.
13