whisper.cpp/examples/lsp/whisper.vim

if !exists("g:whisper_dir")
    let g:whisper_dir = expand($WHISPER_CPP_HOME)
    if g:whisper_dir == ""
        echoerr "Please provide a path to the whisper.cpp repo in either the $WHISPER_CPP_HOME environment variable, or g:whisper_dir"
    endif
endif
if !exists("g:whisper_lsp_path")
    let g:whisper_lsp_path = g:whisper_dir .. "lsp"
    if !filereadable(g:whisper_lsp_path)
        echoerr "Was not able to locate a lsp executable at: " .. g:whisper_lsp_path
        throw "Executable not found"
    endif
endif
if !exists("g:whisper_model_path")
    " TODO: allow custom paths relative to the repo dir
    let g:whisper_model_path = g:whisper_dir .. "models/ggml-base.en.bin"
    if !filereadable(g:whisper_model_path)
        echoerr "Could not find model at: " .. g:whisper_model_path
        throw "Model not found"
    endif
endif
let s:output_buffer = bufnr("whisper_log", v:true)
call setbufvar(s:output_buffer,"&buftype","nofile")
let s:lsp_command = [g:whisper_lsp_path,"-m",g:whisper_model_path]
" For faster execution. TODO: server load multiple models/run multiple servers?
" let s:lsp_command = [g:whisper_lsp_path, "-m", g:whisper_dir .. "models/ggml-tiny.en.bin", "-ac", "128"]

" requestCommands([params_dict])
func whisper#requestCommands(...)
    let l:req = {"method": "guided", "params": {"commandset_index": 0}}
    if a:0 > 0
        call extend(l:req.params, a:1)
    endif
    let resp = ch_sendexpr(g:lsp_job, l:req, {"callback": function("s:commandCallback", [l:req.params, 0])})
endfunction

" doTranscription([params_dict])
func whisper#doTranscription(...)
    let l:req = {"method": "unguided", "params": {}}
    if a:0 > 0
        call extend(l:req.params, a:1)
    endif
    let resp = ch_sendexpr(g:lsp_job, l:req, {"callback": function("s:transcriptionCallback", [function("s:insertText"),function("s:endTranscription")])})
endfunction

" For testing
func whisper#uppertest(cha)
    echo tr(a:cha, s:c_lowerkeys, s:c_upperkeys)
endfunction


" (upper, exit, count, motion, command, insert/append, save run) "base"
" (upper, exit, count, motion, command, inside/around)           "motion/visual"
" (upper, exit, count, motion, line,    inside/around)           "command already entered"
" (upper, exit, key,                                 )           "from/till"

" upper and lower keys is used to translate between cases with tr
" Must be sunchronized
let s:c_lowerkeys = "1234567890-=qwertyuiop[]\\asdfghjkl;'zxcvbnm,./\""
let s:c_upperkeys = "!@#$%^&*()_+QWERTYUIOP{}|ASDFGHJKL:\"ZXCVBNM<>?'"
let s:c_count = split("1234567890\"",'\zs')
let s:c_command = split("ryuogpdxcv.iam", '\zs')
let s:c_motion = split("wetf'hjklnb$^)",'\zs')
" object words: Word, Sentence, Paragraph, [, (, <, Tag, {. ", '
let s:c_area = split("wsp])>t}\"'",'\zs')
"Special commands.
let s:c_special_always = ["exit", "upper"]
let s:c_special_normal = ["save", "run", "space"]

" If not in dict, key is spoken word,
" If key resolves to string, value is used for normal/motion, but key for chars
" If key resolves to dict, {0: "normal",1: "motion",2:"single char",3: "area"}
" Missing entries fall back as follows {0: "required", 1: 0, 2: "key", 3: 0}
let s:spoken_dict = {"w": "word", "e": "end", "r": "replace", "t": {0: "till", 3: "tag"}, "y": "yank", "u": "undo", "i": {0: "insert", 1: "inside"}, "o": "open", "p": {0: "paste", 3: "paragraph"},  "a": {0: "append", 1: "around"}, "s": {0: "substitute", 3: "sentence"}, "d": "delete", "f": "from", "g": "go", "h": "left", "j": "down", "k": "up", "l": "right", "c": "change", "v": "visual", "b": "back", "n": "next", "m": "mark", ".": {0: "repeat", 2: "period"}, "]": {0: "bracket", 2: "bracket"}, "'": {0: "jump", 2: "apostrophe", 3:  "apostrophe"}, '"': {0: 'register', 2: "quotation", 3: "quotation"}, "-": {0: "minus", 2: "minus"}, "$": {0: "dollar", 2: "dollar"}, "^": {0: "carrot", 2: "carrot"}, ")": {0: "sentence", 2: "parenthesis",  3: "parenthesis"}, "}": {0: "paragraph", 2: "brace", 3: "brace"}, ">": {0: "indent", 2: "angle", 3: "angle"}}

" Give this another pass. This seems overly hacky even if it's functional
let s:sub_tran_msg = ""
func s:subTranProg(msg)
    if s:sub_tran_msg != ""
        let s:sub_tran_msg = s:sub_tran_msg .. a:msg
        if mode() !=? 'v'
            exe "normal" "u" .. s:sub_tran_msg
        endif
    else
        if s:command_backlog == ""
            " this should not occur
            call s:logCallback(0, "Warning: Encountered sub transcription without prior command")
            let s:command_backlog = "a"
        endif
        if a:msg[0] == ' '
            let s:sub_tran_msg = s:command_backlog .. a:msg[1:-1]
        else
            let s:sub_tran_msg = s:command_backlog  .. a:msg
        endif
        if mode() !=? 'v'
            exe "normal" s:sub_tran_msg
        endif
    endif
    call appendbufline(s:output_buffer, "$", s:sub_tran_msg ..  ":" .. string(a:msg ))
endfunction

func s:subTranFinish(params, timestamp)
    let s:repeat_command = s:sub_tran_msg
    " Visual selection is lot if used with streaming, so streaming of partial
    " transcriptions is disabled in visual mode
    if mode() ==? 'v'
        exe "normal" s:sub_tran_msg
    endif
    let s:sub_tran_msg = ""
    let s:command_backlog = ""
    exe "normal a\<C-G>u"
    let l:params = a:params
    let l:params.timestamp = a:timestamp
    if exists("l:params.commandset_index")
        unlet l:params.commandset_index
    endif
    call whisper#requestCommands(a:params)
endfunction

func s:logCallback(channel, msg)
    call appendbufline(s:output_buffer,"$",a:msg)
endfunction


func s:transcriptionCallback(progressCallback, finishedCallback, channel, msg)
    let l:tr = a:msg.result.transcription

    let l:ex_ind = match(tolower(l:tr),"exit", len(l:tr)-6)
    " The worst case I've observed so far is " Exit.", which is 6 characters
    if l:ex_ind != -1
        call a:progressCallback(strpart(l:tr,0,l:ex_ind-1))
        call a:finishedCallback(a:msg.result.timestamp)
    else
        call a:progressCallback(l:tr)
        let req = {"method": "unguided", "params": {"timestamp": a:msg.result.timestamp, "no_context": v:true}}
        let resp = ch_sendexpr(g:lsp_job, req, {"callback": function("s:transcriptionCallback", [a:progressCallback, a:finishedCallback])})
    endif
endfunc
func s:insertText(msg)
    exe "normal a" .. a:msg
endfunction
func s:endTranscription(timestamp)
    call appendbufline(s:output_buffer, "$", "Ending unguided transcription")
endfunction


" If a command does not include a whole actionable step, attempting to execute
" it discards the remainder of things. There is likely a simpler solution,
" but it can be made functional now by storing a backbuffer until actionable
let s:command_backlog = ""
let s:repeat_command = ""
let s:preceeding_upper = v:false
func s:commandCallback(params, commandset_index, channel, msg)
    let l:command_index = a:msg.result.command_index
    let l:do_execute = v:false
    let l:next_mode = a:commandset_index
    let l:command = s:commandset_list[a:commandset_index][l:command_index]
    call s:logCallback(0, string(a:msg) .. " " .. a:commandset_index .. " " .. l:command)
    if l:command_index == 0
        "exit
        "if s:command_backlog == ""
        call s:logCallback(0,"Stopping command mode")
        echo "No longer listening"
        let s:command_backlog = ""
        return
        "else
        " Legacy code to clear an existing buffer with exit.
        " Was found to be rarely desired and is better introduced as a
        " standalone command (clear?)
        "   call s:logCallback(0,"Clearing command_backlog" .. s:command_backlog)
        "   let s:command_backlog = ""
        "   let s:preceeding_upper = v:false
        " endif
    elseif l:command_index == 1
        " upper
        let s:preceeding_upper = !s:preceeding_upper
    elseif l:command == "save"
        " save and run can only happen in commandset 0,
        exe "w"
    elseif l:command == "run"
        exe "make run"
    elseif l:command == "space"
        exe "normal i \<ESC>l"
    elseif has_key(s:c_user, l:command)
        let Userfunc = s:c_user[l:command]
        if type(Userfunc) == v:t_string
            let Userfunc = function(Userfunc)
        endif
        call Userfunc()
    else
        if s:preceeding_upper
            " Upper should keep commandset
            let s:preceeding_upper = v:false
            let l:visual_command = tr(l:command, s:c_lowerkeys, s:c_upperkeys)
        else
            let l:visual_command = l:command
        endif
        echo s:command_backlog .. " - " .. l:visual_command
        let s:command_backlog = s:command_backlog .. l:visual_command
        if a:commandset_index == 2 || a:commandset_index == 3
            " single key, either completes motion, replace, or register
            " Should move to execute unless part of a register
            " Change will be caught at execute
            if s:command_backlog[-2:-2] !=# '"'
                call s:logCallback(0,"not register")
                let l:do_execute = v:true
            end
            let l:next_mode = 0
            " commandset index only matters for a/i
        elseif (l:command == "a" || l:command == "i") && a:commandset_index == 1
            " inside/around. Is commandset 3
            let l:next_mode = 3
        elseif l:command ==# '"'
            let l:next_mode = 2
        elseif index(s:c_count, l:command) != -1
            let l:next_mode = a:commandset_index
        elseif index(s:c_motion, l:command) != -1
            if l:command == 't' || l:command == 'f' || l:command == "'"
                " prompt single key
                let l:next_mode = 2
            else
                let l:do_execute = v:true
                let l:next_mode = 0
            endif
        elseif index(s:c_command, l:command) != -1
            if index(["y","g","d","c"], s:command_backlog[-1:-1]) != -1 && s:command_backlog[-1:-1] != s:command_backlog[-2:-2] && mode() !=? 'v'
                " need motion or repeated command
                " Potential for bad state here if disparaging command keys are
                " entered (i.e. yd), but vim can handle checks for this at exe
                " And checking for cases like y123d would complicate things
                let l:next_mode = 1
            elseif index(["i","a","c", "o", "s"], l:command) != -1 || s:command_backlog[-1:-1] ==# 'R'
                "'Insert' mode, do general transcription
                let l:req = {"method": "unguided", "params": a:params}
                let l:req.params.timestamp = a:msg.result.timestamp
                let l:req.params.no_context = v:true
                let resp = ch_sendexpr(g:lsp_job, req, {"callback": function("s:transcriptionCallback", [function("s:subTranProg"), function("s:subTranFinish", [a:params])])})
                return
            elseif l:command == 'r' || l:command == 'm'
                let l:next_mode = 2
            elseif l:command == '.'
                let l:next_mode = 0
                let l:do_execute = v:true
                let s:command_backlog = s:command_backlog[0:-2] .. s:repeat_command
            else
                if l:command ==? 'v'
                    let l:next_mode = 1
                else
                    let l:next_mode = 0
                endif
                let l:do_execute = v:true
            endif
        else
            throw "Invalid command state: " .. l:command .. " " .. a:commandset_index .. " " .. s:command_backlog
        endif
    endif
    if l:do_execute
        if mode() ==?'v' && l:next_mode == 0
            let l:next_mode = 1
        elseif match(s:command_backlog, 'c') != -1
            let l:req = {"method": "unguided", "params": a:params}
            let l:req.params.timestamp = a:msg.result.timestamp
            let l:req.params.no_context = v:true
            let resp = ch_sendexpr(g:lsp_job, req, {"callback": function("s:transcriptionCallback", [function("s:subTranProg"), function("s:subTranFinish", [a:params])])})
            return
        endif
        exe "normal" s:command_backlog
        if index(s:c_motion + ["u"],l:command) == -1
            exe "normal a\<C-G>u"
            let s:repeat_command = s:command_backlog
            call s:logCallback(0, s:command_backlog)
        endif
        let s:command_backlog = ""
    endif
    let l:req = {"method": "guided", "params": a:params}
    let l:req.params.timestamp = a:msg.result.timestamp
    let l:req.params.commandset_index = l:next_mode
    let resp = ch_sendexpr(g:lsp_job, l:req, {"callback": function("s:commandCallback",[a:params, l:next_mode])})
endfunction

func s:loadedCallback(channel, msg)
    echo "Loading complete"
    call s:logCallback(a:channel, a:msg)
endfunction

func s:registerCommandset(commandlist, is_final)
    let req = {"method": "registerCommandset"}
    let req.params = a:commandlist
    call s:logCallback(0, join(a:commandlist))
    call add(g:whisper_commandlist_spoken, a:commandlist)
    if a:is_final
        let resp = ch_sendexpr(g:lsp_job, req, {"callback": "s:loadedCallback"})
    else
        let resp = ch_sendexpr(g:lsp_job, req, {"callback": "s:logCallback"})
    endif
endfunction

func s:registerAllCommands()
    let l:normal = s:c_special_always + s:c_special_normal + s:c_count + s:c_command + s:c_motion + keys(s:c_user)
    let l:visual = s:c_special_always + s:c_count + s:c_command + s:c_motion
    " Currently the same as visual.
    " let l:post_command = s:c_special_always + s:c_count + s:c_command + s:c_motion
    let l:single_key = s:c_special_always + split(s:c_lowerkeys, '\zs')
    let l:area = s:c_special_always + s:c_area

    " Used only for compatibility with the testing script
    let g:whisper_commandlist_spoken = []

    let s:commandset_list = [l:normal, l:visual, l:single_key, l:area]
    call s:registerCommandset(s:commandsetToSpoken(l:normal, 0), v:false)
    call s:registerCommandset(s:commandsetToSpoken(l:visual, 1), v:false)
    call s:registerCommandset(s:commandsetToSpoken(l:single_key, 2), v:false)
    call s:registerCommandset(s:commandsetToSpoken(l:area, 3), v:true)
endfunction

func s:commandsetToSpoken(commandset, spoken_index)
    let l:spoken_list = []
    for l:command in a:commandset
        if has_key(s:spoken_dict, l:command)
            let l:spoken_value = s:spoken_dict[l:command]
            if type(l:spoken_value) == v:t_dict
                if has_key(l:spoken_value, a:spoken_index)
                    let l:spoken_value = l:spoken_value[a:spoken_index]
                else
                    if a:spoken_index == 2
                        let l:spoken_value = l:command
                    else
                        let l:spoken_value = l:spoken_value[0]
                    endif
                endif
            else
                if a:spoken_index == 2
                    let l:spoken_value = l:command
                endif
            endif
        else
            let l:spoken_value = l:command
        endif
        call add(l:spoken_list, l:spoken_value)
    endfor
    return l:spoken_list
endfunction

" TODO: Check lifetime. If the script is resourced, is the existing
" s:lsp_job dropped and therefore killed?
" This seems to not be the case and I've had to deal with zombie processes
" that survive exiting vim, even though said behavior conflicts with my
" understanding of the provided documentation
let s:lsp_opts = {"in_mode": "lsp", "out_mode": "lsp", "err_mode": "nl", "err_io": "buffer", "err_buf": s:output_buffer}
if !exists("g:lsp_job")
    if exists("g:whisper_user_commands")
        let s:c_user = g:whisper_user_commands
    else
        let s:c_user = {}
    endif
    let g:lsp_job = job_start(s:lsp_command, s:lsp_opts)
    if job_status(g:lsp_job) == "fail"
        echoerr "Failed to start whisper job"
    endif
    call s:registerAllCommands()
endif