feat: add llama-master backend (#752)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
parent fb6cce487f
commit 6352448b72
@@ -0,0 +1,25 @@
package main

// GRPC llama server

// Note: this is started internally by LocalAI and a server is allocated for each model

import (
	"flag"

	llama "github.com/go-skynet/LocalAI/pkg/grpc/llm/llama-master"

	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
)

var (
	addr = flag.String("addr", "localhost:50051", "the address to connect to")
)

func main() {
	flag.Parse()

	if err := grpc.StartServer(*addr, &llama.LLM{}); err != nil {
		panic(err)
	}
}
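For context, LocalAI starts one such backend process per model and then drives it over gRPC; a minimal sketch of a hypothetical supervisor is below (the binary name, path, and address are assumptions; "-addr" is the flag defined by the main package above):

package main

import (
	"log"
	"os"
	"os/exec"
)

func main() {
	// Hypothetical path to the built backend binary.
	cmd := exec.Command("./llama-master", "-addr", "127.0.0.1:50051")
	cmd.Stdout, cmd.Stderr = os.Stdout, os.Stderr
	if err := cmd.Start(); err != nil {
		log.Fatal(err)
	}
	// At this point a gRPC client would dial 127.0.0.1:50051 and issue
	// load/predict calls against the server started above.
	if err := cmd.Wait(); err != nil {
		log.Fatal(err)
	}
}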
@@ -0,0 +1,168 @@
package llama

// This is a wrapper to satisfy the GRPC service interface
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)

import (
	"fmt"

	"github.com/go-skynet/LocalAI/pkg/grpc/base"
	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
	"github.com/go-skynet/go-llama.cpp-master"
)

type LLM struct {
	base.Base

	llama *llama.LLama
}

func (llm *LLM) Load(opts *pb.ModelOptions) error {
	llamaOpts := []llama.ModelOption{}

	if opts.ContextSize != 0 {
		llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
	}
	if opts.F16Memory {
		llamaOpts = append(llamaOpts, llama.EnableF16Memory)
	}
	if opts.Embeddings {
		llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
	}
	if opts.NGPULayers != 0 {
		llamaOpts = append(llamaOpts, llama.SetGPULayers(int(opts.NGPULayers)))
	}

	llamaOpts = append(llamaOpts, llama.SetMMap(opts.MMap))
	llamaOpts = append(llamaOpts, llama.SetMainGPU(opts.MainGPU))
	llamaOpts = append(llamaOpts, llama.SetTensorSplit(opts.TensorSplit))
	if opts.NBatch != 0 {
		llamaOpts = append(llamaOpts, llama.SetNBatch(int(opts.NBatch)))
	} else {
		llamaOpts = append(llamaOpts, llama.SetNBatch(512))
	}

	if opts.NUMA {
		llamaOpts = append(llamaOpts, llama.EnableNUMA)
	}

	if opts.LowVRAM {
		// "EnabelLowVRAM" matches the (misspelled) identifier exported by go-llama.cpp
		llamaOpts = append(llamaOpts, llama.EnabelLowVRAM)
	}

	model, err := llama.New(opts.Model, llamaOpts...)
	llm.llama = model
	return err
}
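
// A minimal sketch of a hypothetical caller for Load, assuming it lives in
// this package; the option fields are exactly the ones Load reads above, and
// the model path is a placeholder.
func loadExample() (*LLM, error) {
	llm := &LLM{}
	err := llm.Load(&pb.ModelOptions{
		Model:       "/models/ggml-model.bin", // placeholder path
		ContextSize: 2048,                     // 0 falls back to the library default
		NGPULayers:  32,                       // 0 keeps all layers on CPU
		MMap:        true,
	})
	return llm, err
}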

func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
	predictOptions := []llama.PredictOption{
		llama.SetTemperature(float64(opts.Temperature)),
		llama.SetTopP(float64(opts.TopP)),
		llama.SetTopK(int(opts.TopK)),
		llama.SetTokens(int(opts.Tokens)),
		llama.SetThreads(int(opts.Threads)),
	}

	if opts.PromptCacheAll {
		predictOptions = append(predictOptions, llama.EnablePromptCacheAll)
	}

	if opts.PromptCacheRO {
		predictOptions = append(predictOptions, llama.EnablePromptCacheRO)
	}

	// Expected absolute path
	if opts.PromptCachePath != "" {
		predictOptions = append(predictOptions, llama.SetPathPromptCache(opts.PromptCachePath))
	}

	if opts.Mirostat != 0 {
		predictOptions = append(predictOptions, llama.SetMirostat(int(opts.Mirostat)))
	}

	if opts.MirostatETA != 0 {
		predictOptions = append(predictOptions, llama.SetMirostatETA(float64(opts.MirostatETA)))
	}

	if opts.MirostatTAU != 0 {
		predictOptions = append(predictOptions, llama.SetMirostatTAU(float64(opts.MirostatTAU)))
	}

	if opts.Debug {
		predictOptions = append(predictOptions, llama.Debug)
	}

	predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...))

	if opts.PresencePenalty != 0 {
		predictOptions = append(predictOptions, llama.SetPenalty(float64(opts.PresencePenalty)))
	}

	if opts.NKeep != 0 {
		predictOptions = append(predictOptions, llama.SetNKeep(int(opts.NKeep)))
	}

	if opts.Batch != 0 {
		predictOptions = append(predictOptions, llama.SetBatch(int(opts.Batch)))
	}

	if opts.F16KV {
		predictOptions = append(predictOptions, llama.EnableF16KV)
	}

	if opts.IgnoreEOS {
		predictOptions = append(predictOptions, llama.IgnoreEOS)
	}

	if opts.Seed != 0 {
		predictOptions = append(predictOptions, llama.SetSeed(int(opts.Seed)))
	}

	//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))

	predictOptions = append(predictOptions, llama.SetFrequencyPenalty(float64(opts.FrequencyPenalty)))
	predictOptions = append(predictOptions, llama.SetMlock(opts.MLock))
	predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap))
	predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU))
	predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit))
	predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(float64(opts.TailFreeSamplingZ)))
	predictOptions = append(predictOptions, llama.SetTypicalP(float64(opts.TypicalP)))
	return predictOptions
}

func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
	return llm.llama.Predict(opts.Prompt, buildPredictOptions(opts)...)
}
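
// A minimal sketch of a hypothetical caller for Predict, restricted to
// pb.PredictOptions fields that buildPredictOptions reads above; all values
// are illustrative (zero-valued fields are treated as unset).
func predictExample(llm *LLM) (string, error) {
	return llm.Predict(&pb.PredictOptions{
		Prompt:      "The capital of France is",
		Temperature: 0.7,
		TopP:        0.95,
		TopK:        40,
		Tokens:      64,
		Threads:     4,
		StopPrompts: []string{"\n"},
	})
}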

func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
	predictOptions := buildPredictOptions(opts)

	predictOptions = append(predictOptions, llama.SetTokenCallback(func(token string) bool {
		results <- token
		return true
	}))

	go func() {
		_, err := llm.llama.Predict(opts.Prompt, predictOptions...)
		if err != nil {
			fmt.Println("err: ", err)
		}
		close(results)
	}()

	return nil
}
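
// Because PredictStream returns immediately and the goroutine above closes
// the channel once generation finishes, a caller can simply range over it.
// A minimal sketch, assuming it lives in this package:
func streamExample(llm *LLM, opts *pb.PredictOptions) error {
	results := make(chan string)
	if err := llm.PredictStream(opts, results); err != nil {
		return err
	}
	for token := range results {
		fmt.Print(token) // one string per token callback
	}
	return nil
}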

func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
	predictOptions := buildPredictOptions(opts)

	if len(opts.EmbeddingTokens) > 0 {
		tokens := []int{}
		for _, t := range opts.EmbeddingTokens {
			tokens = append(tokens, int(t))
		}
		return llm.llama.TokenEmbeddings(tokens, predictOptions...)
	}

	return llm.llama.Embeddings(opts.Embeddings, predictOptions...)
}
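
// A minimal sketch of a hypothetical caller for Embeddings: the Embeddings
// field holds the raw text to embed, and pre-tokenized EmbeddingTokens would
// take the token path above if non-empty.
func embeddingsExample(llm *LLM) ([]float32, error) {
	return llm.Embeddings(&pb.PredictOptions{Embeddings: "hello world"})
}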