# bigcode-evaluation-harness/makefile

# There are two Dockerfiles: one for all standard benchmarks (the default,
# `Dockerfile`) and one for MultiPL-E. Select the other one on the command
# line, e.g. `make DOCKERFILE=<path-to-dockerfile> build`.
DOCKERFILE := Dockerfile

# Derive the image tag from the selected Dockerfile: the default Dockerfile
# yields `evaluation-harness`; any other value yields
# `evaluation-harness-multiple`.
# The assignments are indented with spaces, not a tab: a leading tab marks a
# recipe line, so tab-indented assignments only parse while no rule precedes
# them. `$(strip ...)` tolerates stray whitespace in an overridden value.
ifeq ($(strip $(DOCKERFILE)),Dockerfile)
  IMAGE_NAME := evaluation-harness
else
  IMAGE_NAME := evaluation-harness-multiple
endif

# Build the Docker image tagged $(IMAGE_NAME) from $(DOCKERFILE), using the
# current directory as the build context.
# Declared phony so a file or directory named `build` cannot make this target
# look up to date and skip the recipe.
.PHONY: build
build:
	docker build -f "$(DOCKERFILE)" -t "$(IMAGE_NAME)" .

# Smoke-test the image: run main.py inside a $(IMAGE_NAME) container on the
# humaneval task (limited to 4 problems), loading the generations from
# tests/docker_test/test_generations.json instead of producing them.
# The generations file is bind-mounted read-only (`:ro`) at
# /app/test_generations.json; the mount argument is quoted so a checkout path
# containing spaces still works.
# `-it` requests an interactive TTY, so this target expects to be run from a
# terminal.
# Declared phony so a file or directory named `test` cannot suppress the run.
.PHONY: test
test:
	docker run -v "$(CURDIR)/tests/docker_test/test_generations.json:/app/test_generations.json:ro" \
	-it "$(IMAGE_NAME)" python3 main.py --model dummy_model --tasks humaneval --limit 4 \
	--load_generations_path /app/test_generations.json --allow_code_execution
	@echo "If pass@1 is 0.25 then your configuration for standard benchmarks is correct"

# Build the image, then run the smoke test against it.
# Declared phony because `all` names a command, not a file.
# NOTE(review): `test` does not list `build` as a prerequisite, so the
# build-then-test order only holds for a serial make; with `make -j` the two
# may run concurrently.
.PHONY: all
all: build test