# Listen to user speech and convert it to UI commands
# Şamil Korkmaz, 20.10.2024
import json
import os
from datetime import datetime

import openai
import speech_recognition as sr


def record_audio(duration):
    """Record audio from the microphone for the specified duration (in seconds)."""
    recognizer = sr.Recognizer()
    with sr.Microphone() as source:
        print(f"Recording for {duration} seconds...")
        audio = recognizer.record(source, duration=duration)
        print("Recording complete!")
    return audio


def transcribe_audio(audio):
    """Transcribe audio using OpenAI Whisper."""
    temp_filename = None
    try:
        # Save the recorded audio to a temporary WAV file
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        temp_filename = f"temp_audio_{timestamp}.wav"
        with open(temp_filename, "wb") as f:
            f.write(audio.get_wav_data())
        # Send the file to the Whisper transcription endpoint
        with open(temp_filename, "rb") as audio_file:
            transcript = openai.audio.transcriptions.create(
                model="whisper-1",
                file=audio_file
            )
        return transcript.text
    except Exception as e:
        print(f"Error in transcription: {str(e)}")
        return None
    finally:
        # Remove the temporary file once transcription is done
        if temp_filename and os.path.exists(temp_filename):
            os.remove(temp_filename)


def get_user_command_action(user_command):
    """Send the transcribed command to the chat endpoint and get a structured JSON action."""
    messages = [
        {'role': 'system', 'content': 'You are an assistant that provides structured JSON responses based on user commands.'},
        {'role': 'user', 'content': f"Interpret the following user command: '{user_command}' and provide the action as a structured JSON response. The available actions are:\n1. log_in\n2. sign_up\n3. view_products\n4. search_products\n5. add_to_cart\n6. checkout\n7. view_orders\n8. log_out\n9. contact_support\nRespond with the appropriate action as: {{\"action\": \"<action_name>\"}}"}
    ]
    response = openai.chat.completions.create(
        model='gpt-3.5-turbo',
        messages=messages
    )
    action = response.choices[0].message.content
    return action


def process_action(action_data):
    """Map the returned action name to a UI command."""
    action = json.loads(action_data).get('action', 'unknown')
    if action == 'log_in':
        print("Redirecting to Log In page...")
    elif action == 'sign_up':
        print("Redirecting to Sign Up page...")
    elif action == 'view_products':
        print("Showing product listings...")
    elif action == 'search_products':
        print("Initiating product search...")
    elif action == 'add_to_cart':
        print("Adding product to cart...")
    elif action == 'checkout':
        print("Proceeding to checkout...")
    elif action == 'view_orders':
        print("Showing order history...")
    elif action == 'log_out':
        print("Logging out...")
    elif action == 'contact_support':
        print("Redirecting to support page...")
    else:
        print("Unknown action!")


def main():
    openai.api_key = "YOUR OPEN AI KEY"
    try:
        audio = record_audio(5)
        # Transcribe the recorded audio
        print("\nTranscribing audio...")
        transcript = transcribe_audio(audio)
        if transcript:
            print(f"\nTranscript: {transcript}")
            print("\nGetting response from GPT...")
            response = get_user_command_action(transcript)
            if response:
                print("\nGPT Response:")
                print(response)
                process_action(response)
    except Exception as e:
        print(f"An error occurred: {str(e)}")


if __name__ == "__main__":
    main()
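One practical refinement: process_action calls json.loads on the raw model output, which breaks if the model wraps the JSON in extra text. Newer chat models can be asked to return strict JSON via the response_format parameter. A minimal sketch of the adjusted call, assuming a model that supports JSON mode (the model name here is an assumption, not what the script above uses):

response = openai.chat.completions.create(
    model="gpt-4o-mini",  # assumption: a model that supports JSON mode
    messages=messages,
    response_format={"type": "json_object"}  # ask the API to return valid JSON only
)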
When you use an API such as OpenAI's, the main disadvantage is that you pay for every call. Therefore, voice control of the UI should be limited to paying customers, and rate limits should be in place to keep costs under control. You could run an open-source model like LLaMA on your own server instead, but that would require more compute and memory than you currently have.
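To illustrate the rate-limiting idea, here is a minimal in-memory sliding-window limiter. The RateLimiter class, its limits, and the "user_42" id are hypothetical; a real deployment would typically keep these counters in shared storage such as Redis rather than in process memory.

import time
from collections import defaultdict, deque

class RateLimiter:
    """Allow at most max_calls voice commands per user within window_s seconds."""
    def __init__(self, max_calls=10, window_s=3600):
        self.max_calls = max_calls
        self.window_s = window_s
        self.calls = defaultdict(deque)  # user_id -> timestamps of recent calls

    def allow(self, user_id):
        now = time.monotonic()
        q = self.calls[user_id]
        # Drop timestamps that have fallen outside the window
        while q and now - q[0] > self.window_s:
            q.popleft()
        if len(q) >= self.max_calls:
            return False
        q.append(now)
        return True

# Check the limiter before calling transcribe_audio / get_user_command_action
limiter = RateLimiter(max_calls=10, window_s=3600)
if not limiter.allow("user_42"):
    print("Rate limit reached, please try again later.")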
17.02.2025: Open-source AI models like DeepSeek open the door to self-hosted AI. You will need a powerful server with a GPU and plenty of RAM, and renting such a server in the cloud might cost more than the API calls themselves. One solution might be to run the AI model on your own physical server and keep the web app on the cloud server, which then makes API calls to the AI on your machine.
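Many self-hosted serving stacks (vLLM, Ollama and similar) expose an OpenAI-compatible HTTP API, so the web app could keep essentially the same client code and only change the endpoint. A minimal sketch, assuming a DeepSeek model is already being served at the address shown (the URL, model name and API key are placeholders):

from openai import OpenAI

# Point the same client library at your own server instead of api.openai.com
client = OpenAI(
    base_url="http://your-ai-server:8000/v1",  # placeholder: your self-hosted endpoint
    api_key="not-needed-for-local"             # many local servers ignore the key
)

response = client.chat.completions.create(
    model="deepseek-r1",  # placeholder: whatever model name your server registers
    messages=[{"role": "user", "content": "Interpret the command: 'show my orders'"}]
)
print(response.choices[0].message.content)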