折腾了下在 Emacs 中使用语音接入 GPT,通过语音提问,然后语音朗读回答,具体思路为:
- 用 ffmpeg 录音
- 用 gpt-4o-transcribe 把录音转换为文字
- 用 gpt-4o 回答
- 用 gpt-4o-mini-tts 把文字转换为语音
- 用 ffplay 播放语音
全部代码如下。我在 Mac 上测试过可以运行;其他平台需要把录音参数 -f avfoundation -i ":0" 换成对应平台的音频输入(例如 Linux 上的 -f pulse -i default,Windows 上的 -f dshow)。
(defun chunyang-llm-ask ()
  "Ask GPT a question by voice and have the answer read aloud.

The pipeline is:
1. Record audio with ffmpeg.
2. Transcribe the recording to text with gpt-4o-transcribe.
3. Answer the question with gpt-4o.
4. Convert the answer to speech with gpt-4o-mini-tts.
5. Play the speech with ffplay.

The question and the answer are also echoed with `message'.
Signal a `user-error' if the transcription is empty."
  (interactive)
  (let* ((recording (chunyang-llm--record-audio))
         (question (unwind-protect
                       (chunyang-llm--audio-to-text recording)
                     ;; The recording is only needed for transcription;
                     ;; remove it so voice recordings do not pile up in
                     ;; the temporary directory.
                     (ignore-errors (delete-file recording)))))
    ;; Do not spend an API call on a blank or missing transcription.
    (when (or (not (stringp question))
              (string-match-p "\\`[ \t\r\n]*\\'" question))
      (user-error "No speech was recognized in the recording"))
    (message "You: %s" question)
    (let ((answer (chunyang-llm--responses "gpt-4o" question)))
      (message "GPT: %s" answer)
      (chunyang-llm--play-audio (chunyang-llm--text-to-audio answer)))))
(defun chunyang-llm--responses (model input &optional instructions)
  "Send INPUT to the OpenAI Responses API using MODEL and return the reply text.

MODEL is the model name as a string, e.g. \"gpt-4o\".  INPUT is the
prompt string.  Optional INSTRUCTIONS, when non-nil, is sent as the
request's `instructions' field.

Return the first `text' field found among the `content' parts of the
response's `output' items.  Signal an error if there is none."
  (let* ((data (plz 'post
                 "https://api.openai.com/v1/responses"
                 ;; "http://localhost:4444"
                 :headers `(("Authorization" . ,(format "Bearer %s" (chunyang-llm--openai-token)))
                            ("Content-Type" . "application/json"))
                 :body (json-encode
                        `((model . ,model)
                          ;; The key must be the symbol `instructions';
                          ;; using the INSTRUCTIONS string itself as the
                          ;; key would emit a bogus JSON field.
                          ,@(and instructions
                                 (list (cons 'instructions instructions)))
                          (input . ,input)))
                 :as #'json-read
                 :connect-timeout 15))
         ;; Scan every output item instead of assuming the text lives at
         ;; output[0].content[0]; items without `content' are skipped.
         (text (catch 'found
                 (mapc (lambda (item)
                         (mapc (lambda (part)
                                 (let ((part-text (alist-get 'text part)))
                                   (when part-text
                                     (throw 'found part-text))))
                               (alist-get 'content item)))
                       (alist-get 'output data))
                 nil)))
    (or text
        (error "No text output found in the OpenAI response"))))
(defun chunyang-llm--openai-token ()
  "Return the OpenAI API key looked up through auth-source.

The credential is searched for with host \"api.openai.com\" and user
\"apikey\".  Signal a `user-error' when no matching entry is found, so
callers never build an \"Authorization: Bearer nil\" header."
  (or (auth-source-pick-first-password :host "api.openai.com" :user "apikey")
      (user-error "No OpenAI API key found in auth-source for host api.openai.com, user apikey")))
(defun chunyang-llm--record-audio ()
  "Record audio with ffmpeg until a key is pressed; return the WAV file name.

ffmpeg is started with the avfoundation input device \":0\" and writes
to a fresh temporary file whose name ends in \".wav\".  Recording stops
when the user presses any key, or when the prompt is aborted."
  (let* (;; Let `make-temp-file' add the suffix itself.  Appending
         ;; \".wav\" afterwards would leave the suffix-less file it
         ;; created behind as garbage.  ffmpeg overwrites the empty
         ;; file because of \"-y\".
         (output-file (make-temp-file "audio-" nil ".wav"))
         (buffer (generate-new-buffer " *ffmpeg*"))
         (process
          (start-process "ffmpeg" buffer
                         "ffmpeg"
                         "-f" "avfoundation"
                         "-i" ":0"
                         "-y"
                         output-file)))
    (set-process-query-on-exit-flag process nil)
    (unwind-protect
        (read-key "Recording audio. Press any key to finish.")
      ;; Runs on quit too, so ffmpeg never keeps recording in the
      ;; background.
      (when (process-live-p process)
        ;; SIGINT lets ffmpeg finish writing the file; SIGKILL (what
        ;; `kill-process' sends) would stop it abruptly.
        (interrupt-process process)
        (let ((deadline (+ (float-time) 3)))
          (while (and (process-live-p process)
                      (< (float-time) deadline))
            (accept-process-output process 0.1)))
        ;; Fall back to a hard kill if ffmpeg did not exit in time.
        (when (process-live-p process)
          (kill-process process)))
      (when (buffer-live-p buffer)
        (kill-buffer buffer)))
    output-file))
(defun chunyang-llm--audio-to-text (audio-file)
  "Transcribe AUDIO-FILE with gpt-4o-transcribe and return the text.

AUDIO-FILE is uploaded with curl to the OpenAI transcription endpoint.
The response is expected to be a JSON object such as
{\"text\":\"Hello, this is it.\"}; the value of its \"text\" field is
returned.  Signal an error if curl exits with a non-zero status."
  (with-temp-buffer
    (let ((status
           (call-process
            "curl" nil t nil
            "https://api.openai.com/v1/audio/transcriptions"
            "-s"
            ;; Keep curl's one-line error message even though \"-s\"
            ;; silences the progress meter, so failures are explainable.
            "-S"
            "--fail"
            "-H" (format "Authorization: Bearer %s" (chunyang-llm--openai-token))
            "-H" "Content-Type: multipart/form-data"
            "-F" (format "file=@%s" audio-file)
            "-F" "model=gpt-4o-transcribe")))
      ;; With \"--fail\" an HTTP error yields no JSON body; report the
      ;; curl failure instead of a confusing JSON parse error.
      (unless (eq status 0)
        (error "Transcription request failed (curl exit status %s): %s"
               status (buffer-string)))
      (gethash "text" (json-parse-string (buffer-string))))))
(defun chunyang-llm--text-to-audio (string)
  "Convert STRING to speech with gpt-4o-mini-tts and return the WAV file name.

The audio is requested from the OpenAI speech endpoint with the
\"alloy\" voice in WAV format and written to a fresh temporary file."
  (let ((output-file (make-temp-file "audio-" nil ".wav")))
    ;; Reserve a unique name, then remove the placeholder so the
    ;; download can create the file itself (per the plz documentation
    ;; the target of `:as (file ...)' must not already exist).  Letting
    ;; `make-temp-file' add the suffix makes this delete the file that
    ;; was actually created, instead of leaking it.
    (delete-file output-file)
    (plz 'post "https://api.openai.com/v1/audio/speech"
      :headers `(("Authorization" . ,(format "Bearer %s" (chunyang-llm--openai-token)))
                 ("Content-Type" . "application/json"))
      :body (json-serialize
             `((model . "gpt-4o-mini-tts")
               (input . ,string)
               (voice . "alloy")
               (response_format . "wav")))
      :as `(file ,output-file))
    output-file))
(defun chunyang-llm--play-audio (audio-file)
  "Start ffplay on AUDIO-FILE asynchronously and return the process object.

ffplay runs without a display window and exits when playback ends.
Its output is collected in a new hidden buffer."
  (let ((log-buffer (generate-new-buffer " *ffplay*"))
        (player-args (list "-nodisp" "-autoexit" audio-file)))
    (apply #'start-process "ffplay" log-buffer "ffplay" player-args)))