minimal example code location
Some checks failed
Build and Push Docker Image / build-and-push (push) Has been cancelled
Some checks failed
Build and Push Docker Image / build-and-push (push) Has been cancelled
This commit is contained in:
45
.gitea/workflows/docker-publish.yml
Normal file
45
.gitea/workflows/docker-publish.yml
Normal file
@@ -0,0 +1,45 @@
|
||||
name: Build and Push Docker Image
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
build-and-push:
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
REGISTRY: gitea.dataprovider01.sandbox-cat-dat.simpl-europe.eu
|
||||
IMAGE_REPO: gitea.dataprovider01.sandbox-cat-dat.simpl-europe.eu/j.r/template-code-location
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Validate registry secrets
|
||||
run: |
|
||||
if [ -z "${{ secrets.REGISTRY_USERNAME }}" ] || [ -z "${{ secrets.REGISTRY_PASSWORD }}" ]; then
|
||||
echo "Missing REGISTRY_USERNAME or REGISTRY_PASSWORD secret"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Login to registry
|
||||
run: |
|
||||
echo "${{ secrets.REGISTRY_PASSWORD }}" | docker login "${REGISTRY}" \
|
||||
-u "${{ secrets.REGISTRY_USERNAME }}" --password-stdin
|
||||
|
||||
- name: Build image
|
||||
run: |
|
||||
COMMIT_SHA="${GITHUB_SHA:-$GITEA_SHA}"
|
||||
SHORT_SHA="$(echo "${COMMIT_SHA}" | cut -c1-12)"
|
||||
docker build \
|
||||
-t "${IMAGE_REPO}:latest" \
|
||||
-t "${IMAGE_REPO}:${SHORT_SHA}" \
|
||||
.
|
||||
|
||||
- name: Push image tags
|
||||
run: |
|
||||
COMMIT_SHA="${GITHUB_SHA:-$GITEA_SHA}"
|
||||
SHORT_SHA="$(echo "${COMMIT_SHA}" | cut -c1-12)"
|
||||
docker push "${IMAGE_REPO}:latest"
|
||||
docker push "${IMAGE_REPO}:${SHORT_SHA}"
|
||||
37
Dockerfile
37
Dockerfile
@@ -11,16 +11,9 @@ RUN addgroup --gid 1000 appgroup && \
|
||||
|
||||
# Install system dependencies:
|
||||
# - git: required to fetch util-services from GitLab (tool.uv.sources)
|
||||
# - build-essential / gcc / g++ / python3-dev / cmake: native extensions
|
||||
# (scrubadub-spacy → spaCy, pycanon, etc.)
|
||||
# - curl: optional healthcheck / runtime tooling
|
||||
RUN apt-get update && apt-get upgrade -y \
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
build-essential=12.9 \
|
||||
cmake=3.25.1-1 \
|
||||
gcc=4:12.2.0-3 \
|
||||
g++=4:12.2.0-3 \
|
||||
python3-dev=3.11.2-1+b1 \
|
||||
git=1:2.39.5-0+deb12u3 \
|
||||
curl=7.88.1-10+deb12u14 \
|
||||
&& apt-get clean \
|
||||
@@ -28,41 +21,35 @@ RUN apt-get update && apt-get upgrade -y \
|
||||
&& rm -rf /tmp/* \
|
||||
&& rm -rf /var/tmp/*
|
||||
|
||||
# Pre-own /app so appuser can write to it
|
||||
RUN chown -R appuser:appgroup /app
|
||||
# Ensure appuser can create the project virtual environment in /app
|
||||
RUN chown appuser:appgroup /app
|
||||
|
||||
# Copy project metadata and source
|
||||
COPY pyproject.toml .
|
||||
COPY uv.lock .
|
||||
COPY src/ ./src/
|
||||
COPY --chown=appuser:appgroup pyproject.toml .
|
||||
COPY --chown=appuser:appgroup uv.lock .
|
||||
COPY --chown=appuser:appgroup src/ ./src/
|
||||
|
||||
# uv environment knobs:
|
||||
# UV_COMPILE_BYTECODE=1 → compile .pyc files at install time for faster cold start
|
||||
# UV_COMPILE_BYTECODE=0 → skip .pyc precompilation at install time to reduce image size
|
||||
# UV_LINK_MODE=copy → copy files instead of symlinks (required in Docker layers)
|
||||
# UV_SYSTEM_PYTHON=1 → let uv use the system Python interpreter; note that `uv sync` still installs into the project venv (/app/.venv, which is put on PATH below)
|
||||
ENV UV_COMPILE_BYTECODE=1
|
||||
ENV UV_COMPILE_BYTECODE=0
|
||||
ENV UV_LINK_MODE=copy
|
||||
ENV UV_SYSTEM_PYTHON=1
|
||||
|
||||
# Install the project and all dependencies, respecting [tool.uv.sources]
|
||||
# (git source for util-services and pytorch-cpu index for torch)
|
||||
# BuildKit cache mount keeps the uv package cache across builds
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv sync --frozen --no-dev
|
||||
USER appuser
|
||||
|
||||
RUN --mount=type=cache,target=/home/appuser/.cache/uv,uid=1000,gid=1000 \
|
||||
uv sync --frozen --no-dev --no-install-package torch
|
||||
|
||||
# Put the project's venv on PATH (matches WORKDIR)
|
||||
ENV PATH="/app/.venv/bin:${PATH}"
|
||||
ENV PYTHONPATH="/app/src"
|
||||
|
||||
# Make /app writable for the non-root user (e.g. spaCy model downloads)
|
||||
RUN chown -R 1000:1000 /app && chmod -R u+w /app
|
||||
|
||||
# Provide a real home directory for appuser
|
||||
RUN mkdir -p /home/appuser && chown -R 1000:1000 /home/appuser
|
||||
ENV HOME=/home/appuser
|
||||
|
||||
USER appuser
|
||||
|
||||
# Sanity-check: fail the build early if the dagster CLI is missing
|
||||
RUN dagster --version
|
||||
|
||||
|
||||
@@ -10,28 +10,16 @@ requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
"dagster>=1.8.13",
|
||||
"util-services",
|
||||
"data-processing",
|
||||
"dataframe-level-anonymisation",
|
||||
"field-level-pseudo-anonymisation",
|
||||
]
|
||||
|
||||
[tool.uv]
|
||||
exclude-dependencies = ["transformers", "spacy-transformers"]
|
||||
exclude-dependencies = ["transformers", "spacy-transformers", "torch"]
|
||||
override-dependencies = [
|
||||
"util-services @ git+https://code.europa.eu/simpl/simpl-open/development/data-services/util-services.git@v0.7.0",
|
||||
]
|
||||
|
||||
[tool.uv.sources]
|
||||
torch = { index = "pytorch-cpu" }
|
||||
util-services = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/util-services.git", rev = "v0.7.0" }
|
||||
data-processing = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/data-processing.git", rev = "v0.4.0" }
|
||||
dataframe-level-anonymisation = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/dataframe-level-anonymisation.git", rev = "v0.6.0" }
|
||||
field-level-pseudo-anonymisation = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/field-level-pseudo-anonymisation.git", rev = "v0.7.0" }
|
||||
|
||||
[[tool.uv.index]]
|
||||
name = "pytorch-cpu"
|
||||
url = "https://download.pytorch.org/whl/cpu"
|
||||
explicit = true
|
||||
|
||||
[project.optional-dependencies]
|
||||
dev = [
|
||||
|
||||
@@ -1,70 +1,14 @@
|
||||
from dagster import Definitions
|
||||
from util_services.resources import s3_resource
|
||||
from util_services.sensors import (
|
||||
notify_success,
|
||||
notify_failure,
|
||||
notify_canceled
|
||||
)
|
||||
from util_services.custom_json_logger import simpl_json_logger
|
||||
|
||||
# Data processing jobs
|
||||
from data_processing.jobs import (
|
||||
remove_duplicates_job_s3,
|
||||
fill_missing_values_job_s3,
|
||||
standardize_categorical_values_job_s3,
|
||||
correct_typos_job_s3,
|
||||
normalize_numeric_min_max_job_s3,
|
||||
normalize_datetime_job_s3,
|
||||
normalize_coordinates_job_s3,
|
||||
add_global_aggregations_job_s3,
|
||||
filter_dataset_job_s3,
|
||||
quality_job_s3
|
||||
)
|
||||
|
||||
# Dataframe-level anonymisation jobs
|
||||
from dataframe_level_anonymisation.jobs import (
|
||||
k_anonymity_job_s3,
|
||||
l_diversity_job_s3,
|
||||
t_closeness_job_s3,
|
||||
read_write_semistructured_job_s3,
|
||||
)
|
||||
|
||||
# Field-level pseudo-anonymisation jobs
|
||||
from field_level_pseudo_anonymisation.jobs import (
|
||||
anonymise_pseudonymise_structured_job_s3,
|
||||
depseudonymise_structured_job_s3,
|
||||
anonymise_pseudonymise_unstructured_job_s3,
|
||||
depseudonymise_unstructured_job_s3,
|
||||
)
|
||||
from util_services.resources import s3_resource
|
||||
|
||||
from template_code_location.jobs.jobs import data_processing_job
|
||||
|
||||
defs = Definitions(
|
||||
jobs=[
|
||||
data_processing_job,
|
||||
# Data processing
|
||||
remove_duplicates_job_s3,
|
||||
fill_missing_values_job_s3,
|
||||
standardize_categorical_values_job_s3,
|
||||
correct_typos_job_s3,
|
||||
normalize_numeric_min_max_job_s3,
|
||||
normalize_datetime_job_s3,
|
||||
normalize_coordinates_job_s3,
|
||||
add_global_aggregations_job_s3,
|
||||
filter_dataset_job_s3,
|
||||
quality_job_s3,
|
||||
# Dataframe-level anonymisation
|
||||
k_anonymity_job_s3,
|
||||
l_diversity_job_s3,
|
||||
t_closeness_job_s3,
|
||||
read_write_semistructured_job_s3,
|
||||
# Field-level pseudo-anonymisation
|
||||
anonymise_pseudonymise_structured_job_s3,
|
||||
depseudonymise_structured_job_s3,
|
||||
anonymise_pseudonymise_unstructured_job_s3,
|
||||
depseudonymise_unstructured_job_s3,
|
||||
data_processing_job
|
||||
],
|
||||
sensors=[notify_success, notify_failure, notify_canceled],
|
||||
sensors=[],
|
||||
resources={"s3": s3_resource.configured({"resource_name": "selfS3"})},
|
||||
loggers={"simpl": simpl_json_logger},
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user