minimal example code location
Some checks failed
Build and Push Docker Image / build-and-push (push) Has been cancelled

This commit is contained in:
Richard Mrasek
2026-06-11 15:19:55 +02:00
parent 9804813929
commit dec5b0d5ea
5 changed files with 73 additions and 3238 deletions

View File

@@ -0,0 +1,45 @@
# Builds the Docker image from the repository root and pushes it to the
# registry under two tags: `latest` and the first 12 characters of the
# commit SHA. Runs on every push to `main` and on manual dispatch.
name: Build and Push Docker Image

on:
  push:
    branches:
      - main
  workflow_dispatch:

jobs:
  build-and-push:
    runs-on: ubuntu-latest
    env:
      REGISTRY: gitea.dataprovider01.sandbox-cat-dat.simpl-europe.eu
      IMAGE_REPO: gitea.dataprovider01.sandbox-cat-dat.simpl-europe.eu/j.r/template-code-location
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # Secrets are handed to the shell as environment variables instead of
      # being expanded into the script text, so values containing quotes,
      # `$`, backticks or backslashes cannot break or alter the script.
      - name: Validate registry secrets
        env:
          REGISTRY_USERNAME: ${{ secrets.REGISTRY_USERNAME }}
          REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
        run: |
          if [ -z "${REGISTRY_USERNAME}" ] || [ -z "${REGISTRY_PASSWORD}" ]; then
            echo "Missing REGISTRY_USERNAME or REGISTRY_PASSWORD secret"
            exit 1
          fi

      - name: Login to registry
        env:
          REGISTRY_USERNAME: ${{ secrets.REGISTRY_USERNAME }}
          REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
        run: |
          # printf writes the password verbatim; echo may mangle values that
          # start with `-` or contain backslashes, depending on the shell.
          printf '%s' "${REGISTRY_PASSWORD}" | docker login "${REGISTRY}" \
            -u "${REGISTRY_USERNAME}" --password-stdin

      - name: Build image
        run: |
          # Use GITHUB_SHA, falling back to GITEA_SHA; abort with a clear
          # message when neither is set rather than producing an empty tag.
          COMMIT_SHA="${GITHUB_SHA:-${GITEA_SHA:?neither GITHUB_SHA nor GITEA_SHA is set}}"
          SHORT_SHA="$(echo "${COMMIT_SHA}" | cut -c1-12)"
          docker build \
            -t "${IMAGE_REPO}:latest" \
            -t "${IMAGE_REPO}:${SHORT_SHA}" \
            .

      - name: Push image tags
        run: |
          # Recomputed here because shell variables do not persist between
          # steps; must match the tag derived in the build step.
          COMMIT_SHA="${GITHUB_SHA:-${GITEA_SHA:?neither GITHUB_SHA nor GITEA_SHA is set}}"
          SHORT_SHA="$(echo "${COMMIT_SHA}" | cut -c1-12)"
          docker push "${IMAGE_REPO}:latest"
          docker push "${IMAGE_REPO}:${SHORT_SHA}"

View File

@@ -11,16 +11,9 @@ RUN addgroup --gid 1000 appgroup && \
# Install system dependencies:
# - git: required to fetch util-services from GitLab (tool.uv.sources)
# - build-essential / gcc / g++ / python3-dev / cmake: native extensions
# (scrubadub-spacy → spaCy, pycanon, etc.)
# - curl: optional healthcheck / runtime tooling
RUN apt-get update && apt-get upgrade -y \
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
build-essential=12.9 \
cmake=3.25.1-1 \
gcc=4:12.2.0-3 \
g++=4:12.2.0-3 \
python3-dev=3.11.2-1+b1 \
git=1:2.39.5-0+deb12u3 \
curl=7.88.1-10+deb12u14 \
&& apt-get clean \
@@ -28,41 +21,35 @@ RUN apt-get update && apt-get upgrade -y \
&& rm -rf /tmp/* \
&& rm -rf /var/tmp/*
# Pre-own /app so appuser can write to it
RUN chown -R appuser:appgroup /app
# Ensure appuser can create the project virtual environment in /app
RUN chown appuser:appgroup /app
# Copy project metadata and source
COPY pyproject.toml .
COPY uv.lock .
COPY src/ ./src/
COPY --chown=appuser:appgroup pyproject.toml .
COPY --chown=appuser:appgroup uv.lock .
COPY --chown=appuser:appgroup src/ ./src/
# uv environment knobs:
# UV_COMPILE_BYTECODE → compile .pyc files at install time for faster cold start
# UV_COMPILE_BYTECODE → disable .pyc precompile to reduce image size
# UV_LINK_MODE=copy → copy files instead of symlinks (required in Docker layers)
# UV_SYSTEM_PYTHON=1 → prefer the system Python interpreter (the project itself is still installed into /app/.venv)
ENV UV_COMPILE_BYTECODE=1
ENV UV_COMPILE_BYTECODE=0
ENV UV_LINK_MODE=copy
ENV UV_SYSTEM_PYTHON=1
# Install the project and all dependencies, respecting [tool.uv.sources]
# (git source for util-services and pytorch-cpu index for torch)
# BuildKit cache mount keeps the uv package cache across builds
RUN --mount=type=cache,target=/root/.cache/uv \
uv sync --frozen --no-dev
USER appuser
RUN --mount=type=cache,target=/home/appuser/.cache/uv,uid=1000,gid=1000 \
uv sync --frozen --no-dev --no-install-package torch
# Put the project's venv on PATH (matches WORKDIR)
ENV PATH="/app/.venv/bin:${PATH}"
ENV PYTHONPATH="/app/src"
# Make /app writable for the non-root user (e.g. spaCy model downloads)
RUN chown -R 1000:1000 /app && chmod -R u+w /app
# Provide a real home directory for appuser
RUN mkdir -p /home/appuser && chown -R 1000:1000 /home/appuser
ENV HOME=/home/appuser
USER appuser
# Sanity-check: fail the build early if the dagster CLI is missing
RUN dagster --version

View File

@@ -10,28 +10,16 @@ requires-python = ">=3.12"
dependencies = [
"dagster>=1.8.13",
"util-services",
"data-processing",
"dataframe-level-anonymisation",
"field-level-pseudo-anonymisation",
]
[tool.uv]
exclude-dependencies = ["transformers", "spacy-transformers"]
exclude-dependencies = ["transformers", "spacy-transformers", "torch"]
override-dependencies = [
"util-services @ git+https://code.europa.eu/simpl/simpl-open/development/data-services/util-services.git@v0.7.0",
]
[tool.uv.sources]
torch = { index = "pytorch-cpu" }
util-services = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/util-services.git", rev = "v0.7.0" }
data-processing = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/data-processing.git", rev = "v0.4.0" }
dataframe-level-anonymisation = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/dataframe-level-anonymisation.git", rev = "v0.6.0" }
field-level-pseudo-anonymisation = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/field-level-pseudo-anonymisation.git", rev = "v0.7.0" }
[[tool.uv.index]]
name = "pytorch-cpu"
url = "https://download.pytorch.org/whl/cpu"
explicit = true
[project.optional-dependencies]
dev = [

View File

@@ -1,70 +1,14 @@
from dagster import Definitions
from util_services.resources import s3_resource
from util_services.sensors import (
notify_success,
notify_failure,
notify_canceled
)
from util_services.custom_json_logger import simpl_json_logger
# Data processing jobs
from data_processing.jobs import (
remove_duplicates_job_s3,
fill_missing_values_job_s3,
standardize_categorical_values_job_s3,
correct_typos_job_s3,
normalize_numeric_min_max_job_s3,
normalize_datetime_job_s3,
normalize_coordinates_job_s3,
add_global_aggregations_job_s3,
filter_dataset_job_s3,
quality_job_s3
)
# Dataframe-level anonymisation jobs
from dataframe_level_anonymisation.jobs import (
k_anonymity_job_s3,
l_diversity_job_s3,
t_closeness_job_s3,
read_write_semistructured_job_s3,
)
# Field-level pseudo-anonymisation jobs
from field_level_pseudo_anonymisation.jobs import (
anonymise_pseudonymise_structured_job_s3,
depseudonymise_structured_job_s3,
anonymise_pseudonymise_unstructured_job_s3,
depseudonymise_unstructured_job_s3,
)
from util_services.resources import s3_resource
from template_code_location.jobs.jobs import data_processing_job
defs = Definitions(
jobs=[
data_processing_job,
# Data processing
remove_duplicates_job_s3,
fill_missing_values_job_s3,
standardize_categorical_values_job_s3,
correct_typos_job_s3,
normalize_numeric_min_max_job_s3,
normalize_datetime_job_s3,
normalize_coordinates_job_s3,
add_global_aggregations_job_s3,
filter_dataset_job_s3,
quality_job_s3,
# Dataframe-level anonymisation
k_anonymity_job_s3,
l_diversity_job_s3,
t_closeness_job_s3,
read_write_semistructured_job_s3,
# Field-level pseudo-anonymisation
anonymise_pseudonymise_structured_job_s3,
depseudonymise_structured_job_s3,
anonymise_pseudonymise_unstructured_job_s3,
depseudonymise_unstructured_job_s3,
data_processing_job
],
sensors=[notify_success, notify_failure, notify_canceled],
sensors=[],
resources={"s3": s3_resource.configured({"resource_name": "selfS3"})},
loggers={"simpl": simpl_json_logger},
)

3153
uv.lock generated

File diff suppressed because it is too large Load Diff