# Serving Private & Gated Models
# Launch text-generation-inference (TGI) for a gated model.
# The token must be a Hugging Face *read* token with access granted to the
# gated repo (here meta-llama). Replace the <...> placeholders before running.
# Option 1: export the token so local tools (e.g. huggingface-cli) can use it.
export HUGGING_FACE_HUB_TOKEN=<YOUR READ TOKEN>

model=meta-llama/Llama-2-7b-chat-hf
volume=$PWD/data        # host dir mounted at /data: caches weights across runs
token=<your READ token> # Option 2: pass the token into the container via -e below

docker run --gpus all \
  --shm-size 1g \
  -e HUGGING_FACE_HUB_TOKEN="$token" \
  -p 8080:80 \
  -v "$volume":/data \
  ghcr.io/huggingface/text-generation-inference:1.1.1 \
  --model-id "$model"