Compare commits
89 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 01fb721162 | |||
| db85eb865f | |||
| 1dd776a410 | |||
| cfdeef41cb | |||
| e92e98a936 | |||
| 7e743c1157 | |||
| 7c028ef02f | |||
| e0ba120da4 | |||
| 5a7f568ba6 | |||
| 229f6953aa | |||
| a814fbf4dc | |||
| 2e7bf4a0d5 | |||
| 1354b30863 | |||
| f5c7236767 | |||
| 7e15258536 | |||
| a868085ce3 | |||
| 6afb2523e3 | |||
| 7ec7e97801 | |||
| 17369559fe | |||
| c89bb8a389 | |||
| 2fadac0176 | |||
| 84ae56368b | |||
| e338459f66 | |||
| 18bd8f039a | |||
| e46c826c01 | |||
|
|
60096f840d | ||
| ae83ac8a99 | |||
| d569f43dcc | |||
| 3485844c5b | |||
|
|
77f2acb949 | ||
|
|
5500dd7d03 | ||
| e041204047 | |||
| 88d4b9e883 | |||
|
|
af57ed81ca | ||
|
|
9188fe7302 | ||
|
|
435d316258 | ||
|
|
452e5cfe0c | ||
|
|
ffe20d285c | ||
|
|
940c090c7a | ||
|
|
e85c36715a | ||
|
|
874477bbc1 | ||
|
|
7666731f66 | ||
|
|
b8b3c8c41d | ||
| 42c4c77215 | |||
| ae11bc7a10 | |||
|
|
c1aa7714e6 | ||
|
|
52d2bb1812 | ||
|
|
a9946102ed | ||
|
|
a8a97028c1 | ||
|
|
eae07a7246 | ||
|
|
acab929fe9 | ||
|
|
8e667848c9 | ||
|
|
9acae349c4 | ||
|
|
f4e7e79ec1 | ||
|
|
2e1ea8e432 | ||
|
|
b8b42bb3fa | ||
|
|
dd71829735 | ||
|
|
a63675c36d | ||
|
|
58600a3767 | ||
|
|
89601cedeb | ||
|
|
f1cfc8fa83 | ||
|
|
dec5b0d5ea | ||
|
|
9804813929 | ||
|
|
a29c3877a9 | ||
|
|
e735992048 | ||
|
|
7c67371bb0 | ||
|
|
56c8170585 | ||
|
|
d3b7d17ccd | ||
|
|
5760f9424a | ||
|
|
b19182d1d7 | ||
|
|
63025e83c8 | ||
|
|
4ece2f3274 | ||
|
|
e168eebff3 | ||
|
|
5d691fe3cf | ||
|
|
dc0774599d | ||
|
|
9553231c98 | ||
|
|
0701bdb9fd | ||
|
|
75408a05ee | ||
|
|
e684083423 | ||
|
|
4723555e31 | ||
|
|
e55c66b94e | ||
|
|
9e3ff7d4ac | ||
|
|
c8a3657126 | ||
|
|
1b154aa2d7 | ||
|
|
bfc22e594a | ||
|
|
39677b9686 | ||
|
|
ea757b632a | ||
|
|
96a8a8e4eb | ||
|
|
81bb3299a4 |
113
.gitea/workflows/check-deleted-workflows.yml
Normal file
113
.gitea/workflows/check-deleted-workflows.yml
Normal file
@@ -0,0 +1,113 @@
|
||||
name: Check Deleted Workflows
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
types:
|
||||
- opened
|
||||
- synchronize
|
||||
- reopened
|
||||
- ready_for_review
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
check-deleted-workflows:
|
||||
runs-on: orchestration-platform
|
||||
container:
|
||||
image: python:3.12-slim
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
steps:
|
||||
- name: Install git
|
||||
run: |
|
||||
apt-get update -qq
|
||||
apt-get install -y --no-install-recommends git
|
||||
|
||||
- name: Checkout repository
|
||||
run: |
|
||||
REPO_DIR="repo"
|
||||
REPO_CLONE_URL="https://gitea.dataprovider01.sandbox-cat-dat.simpl-europe.eu/dataprovider01/template-code-location.git"
|
||||
CLONE_USER="${{ secrets.REGISTRY_USERNAME }}"
|
||||
CLONE_PASS="${{ secrets.REGISTRY_PASSWORD }}"
|
||||
|
||||
if [ -z "${CLONE_USER}" ] || [ -z "${CLONE_PASS}" ]; then
|
||||
echo "Missing REGISTRY_USERNAME or REGISTRY_PASSWORD secret"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
rm -rf "${REPO_DIR}"
|
||||
AUTH_HEADER="$(printf '%s:%s' "${CLONE_USER}" "${CLONE_PASS}" | base64 | tr -d '\n')"
|
||||
HEAD_BRANCH="${GITHUB_HEAD_REF:-${GITHUB_REF_NAME:-develop}}"
|
||||
git clone \
|
||||
--branch "${HEAD_BRANCH}" \
|
||||
-c "http.extraHeader=Authorization: Basic ${AUTH_HEADER}" \
|
||||
"${REPO_CLONE_URL}" \
|
||||
"${REPO_DIR}"
|
||||
|
||||
- name: Install runtime tools
|
||||
run: |
|
||||
apt-get update -qq
|
||||
apt-get install -y --no-install-recommends git jq curl gcc librdkafka-dev libpython3-dev
|
||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
ln -sf "${HOME}/.local/bin/uv" /usr/local/bin/uv
|
||||
|
||||
- name: Install project dependencies
|
||||
run: |
|
||||
cd repo
|
||||
uv sync --frozen --no-dev --no-install-package torch
|
||||
|
||||
- name: Compute deleted workflows/jobs against main
|
||||
run: |
|
||||
cd repo
|
||||
PATH="$PWD/.venv/bin:$PATH" \
|
||||
BASE_REF="${GITHUB_BASE_REF:-main}" \
|
||||
HEAD_REF="${GITHUB_HEAD_REF:-HEAD}" \
|
||||
REPOSITORY_FILE="src/template_code_location/repository.py" \
|
||||
DIFF_OUTPUT="deleted_workflows.txt" \
|
||||
FAIL_ON_DELETION="false" \
|
||||
bash .gitea/workflows/list_jobs.sh
|
||||
|
||||
- name: Compute active workflows list
|
||||
run: |
|
||||
cd repo
|
||||
PATH="$PWD/.venv/bin:$PATH" \
|
||||
ONLY_ACTIVE="true" \
|
||||
REGISTRY_USERNAME="${{ secrets.REGISTRY_USERNAME }}" \
|
||||
REGISTRY_PASSWORD="${{ secrets.REGISTRY_PASSWORD }}" \
|
||||
bash .gitea/workflows/check_active_workflows.sh > active_workflows.txt
|
||||
echo "--- Active workflows ---"
|
||||
cat active_workflows.txt || echo "(none)"
|
||||
|
||||
- name: Fail only on overlap with active workflows
|
||||
run: |
|
||||
cd repo
|
||||
|
||||
echo "--- Deleted workflows ---"
|
||||
cat deleted_workflows.txt 2>/dev/null || echo "(none)"
|
||||
|
||||
if [ ! -s deleted_workflows.txt ]; then
|
||||
echo "No deleted workflows/jobs found."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if [ ! -s active_workflows.txt ]; then
|
||||
echo "Active workflows list is empty; no overlap to block on."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
awk 'length($0) > 0' deleted_workflows.txt | sort -u > deleted_normalized.txt
|
||||
awk 'length($0) > 0' active_workflows.txt | sort -u > active_normalized.txt
|
||||
comm -12 deleted_normalized.txt active_normalized.txt > overlapping_workflows.txt
|
||||
|
||||
if [ -s overlapping_workflows.txt ]; then
|
||||
echo "------------------------------------------------"
|
||||
echo "DELETED ACTIVE WORKFLOWS DETECTED"
|
||||
echo "The following deleted workflows/jobs are currently active:"
|
||||
cat overlapping_workflows.txt
|
||||
echo "------------------------------------------------"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "No overlap between deleted workflows/jobs and active workflows."
|
||||
66
.gitea/workflows/check_active_workflows.sh
Normal file
66
.gitea/workflows/check_active_workflows.sh
Normal file
@@ -0,0 +1,66 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# Default configuration (override via env vars).
|
||||
BASE_URL="${BASE_URL:-https://participant.be.dataprovider01.sandbox-cat-dat.simpl-europe.eu}"
|
||||
AUTH_BASE="${AUTH_BASE:-${BASE_URL}/auth}"
|
||||
REALM="${REALM:-participant}"
|
||||
USERNAME="${USERNAME:-${REGISTRY_USERNAME:-}}"
|
||||
PASSWORD="${PASSWORD:-${REGISTRY_PASSWORD:-}}"
|
||||
CLIENT_ID="${CLIENT_ID:-frontend-cli}"
|
||||
WORKFLOW_URL="${WORKFLOW_URL:-${BASE_URL}/asset-orchestrator/v1/workflowDefinitions}"
|
||||
ONLY_ACTIVE="${ONLY_ACTIVE:-true}"
|
||||
|
||||
TOKEN_URL="${AUTH_BASE}/realms/${REALM}/protocol/openid-connect/token"
|
||||
|
||||
error() {
|
||||
printf "%s\n" "$1" >&2
|
||||
}
|
||||
|
||||
command -v jq >/dev/null 2>&1 || {
|
||||
error "jq is required"
|
||||
exit 1
|
||||
}
|
||||
|
||||
TOKEN_RESPONSE=$(curl -sS -X POST "${TOKEN_URL}" \
|
||||
-H "Content-Type: application/x-www-form-urlencoded" \
|
||||
--data-urlencode "grant_type=password" \
|
||||
--data-urlencode "client_id=${CLIENT_ID}" \
|
||||
--data-urlencode "username=${USERNAME}" \
|
||||
--data-urlencode "password=${PASSWORD}")
|
||||
|
||||
ACCESS_TOKEN=$(printf '%s' "$TOKEN_RESPONSE" | jq -r '.access_token // empty')
|
||||
|
||||
if [ -z "${USERNAME:-}" ] || [ -z "${PASSWORD:-}" ]; then
|
||||
error "USERNAME/PASSWORD (or REGISTRY_USERNAME/REGISTRY_PASSWORD) must be set"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ -z "$ACCESS_TOKEN" ]; then
|
||||
error "Failed to obtain access token"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
TMP_BODY=$(mktemp)
|
||||
trap 'rm -f "$TMP_BODY"' EXIT
|
||||
|
||||
HTTP_STATUS=$(curl -sS -o "$TMP_BODY" -w "%{http_code}" \
|
||||
-X GET --get "${WORKFLOW_URL}" \
|
||||
--data-urlencode "onlyActive=${ONLY_ACTIVE}" \
|
||||
-H "Authorization: Bearer ${ACCESS_TOKEN}" \
|
||||
-H "Accept: application/json")
|
||||
|
||||
BODY=$(cat "$TMP_BODY")
|
||||
|
||||
if [ "$HTTP_STATUS" -lt 200 ] || [ "$HTTP_STATUS" -ge 300 ]; then
|
||||
error "Workflow API call failed with HTTP ${HTTP_STATUS}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Remove invalid ASCII control chars except TAB/LF/CR, then extract unique job names.
|
||||
printf '%s' "$BODY" \
|
||||
| tr -d '\000-\010\013\014\016-\037' \
|
||||
| jq -r '.. | objects | .jobName? // empty' \
|
||||
| sed 's/^[[:space:]]*//; s/[[:space:]]*$//' \
|
||||
| awk 'length($0) > 0 && !seen[$0]++'
|
||||
|
||||
112
.gitea/workflows/docker-publish.yml
Normal file
112
.gitea/workflows/docker-publish.yml
Normal file
@@ -0,0 +1,112 @@
|
||||
name: Build and Push Docker Image
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
build-and-push:
|
||||
runs-on: orchestration-platform
|
||||
defaults:
|
||||
run:
|
||||
shell: sh
|
||||
env:
|
||||
REGISTRY: gitea.dataprovider01.sandbox-cat-dat.simpl-europe.eu
|
||||
IMAGE_REPO: gitea.dataprovider01.sandbox-cat-dat.simpl-europe.eu/dataprovider01/template-code-location
|
||||
K8S_NAMESPACE: dataprovider01
|
||||
HELM_RELEASE: dataprovider01-dataprovider-orchestration-platform
|
||||
DAGSTER_CHART_VERSION: 0.2.0
|
||||
steps:
|
||||
- name: Checkout repository (shell)
|
||||
run: |
|
||||
REPO_DIR="repo"
|
||||
REPO_CLONE_URL="https://gitea.dataprovider01.sandbox-cat-dat.simpl-europe.eu/dataprovider01/template-code-location.git"
|
||||
CLONE_USER="${{ secrets.REGISTRY_USERNAME }}"
|
||||
CLONE_PASS="${{ secrets.REGISTRY_PASSWORD }}"
|
||||
REF_NAME="${GITHUB_REF_NAME}"
|
||||
if [ -z "${REF_NAME}" ]; then
|
||||
REF_NAME="${GITHUB_REF#refs/heads/}"
|
||||
fi
|
||||
|
||||
if [ -z "${CLONE_USER}" ] || [ -z "${CLONE_PASS}" ]; then
|
||||
echo "Missing REGISTRY_USERNAME or REGISTRY_PASSWORD secret"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
rm -rf "${REPO_DIR}"
|
||||
AUTH_HEADER="$(printf '%s:%s' "${CLONE_USER}" "${CLONE_PASS}" | base64 | tr -d '\n')"
|
||||
git clone --depth 1 --branch "${REF_NAME}" \
|
||||
-c "http.extraHeader=Authorization: Basic ${AUTH_HEADER}" \
|
||||
"${REPO_CLONE_URL}" \
|
||||
"${REPO_DIR}"
|
||||
|
||||
if [ ! -f "${REPO_DIR}/Dockerfile" ]; then
|
||||
echo "Dockerfile not found after clone"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Validate registry secrets
|
||||
run: |
|
||||
if [ -z "${{ secrets.REGISTRY_USERNAME }}" ] || [ -z "${{ secrets.REGISTRY_PASSWORD }}" ]; then
|
||||
echo "Missing REGISTRY_USERNAME or REGISTRY_PASSWORD secret"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Login to registry
|
||||
run: |
|
||||
echo "${{ secrets.REGISTRY_PASSWORD }}" | docker login "${REGISTRY}" \
|
||||
-u "${{ secrets.REGISTRY_USERNAME }}" --password-stdin
|
||||
|
||||
- name: Build image
|
||||
run: |
|
||||
COMMIT_SHA="${GITHUB_SHA:-$GITEA_SHA}"
|
||||
SHORT_SHA="$(echo "${COMMIT_SHA}" | cut -c1-12)"
|
||||
cd repo
|
||||
docker build \
|
||||
-t "${IMAGE_REPO}:latest" \
|
||||
-t "${IMAGE_REPO}:${SHORT_SHA}" \
|
||||
.
|
||||
|
||||
- name: Validate Dagster runtime imports
|
||||
run: |
|
||||
COMMIT_SHA="${GITHUB_SHA:-$GITEA_SHA}"
|
||||
SHORT_SHA="$(echo "${COMMIT_SHA}" | cut -c1-12)"
|
||||
docker run --rm "${IMAGE_REPO}:${SHORT_SHA}" \
|
||||
python -c "import dagster_postgres; print('dagster_postgres import OK')"
|
||||
|
||||
- name: Push image tags
|
||||
run: |
|
||||
COMMIT_SHA="${GITHUB_SHA:-$GITEA_SHA}"
|
||||
SHORT_SHA="$(echo "${COMMIT_SHA}" | cut -c1-12)"
|
||||
docker push "${IMAGE_REPO}:latest"
|
||||
docker push "${IMAGE_REPO}:${SHORT_SHA}"
|
||||
|
||||
- name: Install kubectl
|
||||
run: |
|
||||
apk add --no-cache kubectl
|
||||
|
||||
- name: Setup kubectl
|
||||
run: |
|
||||
mkdir -p "${HOME}/.kube"
|
||||
echo "${{ secrets.KUBE_CONFIG_B64 }}" | base64 -d > "${HOME}/.kube/config"
|
||||
chmod 600 "${HOME}/.kube/config"
|
||||
|
||||
- name: Update Dagster user deployment image
|
||||
run: |
|
||||
COMMIT_SHA="${GITHUB_SHA:-$GITEA_SHA}"
|
||||
SHORT_SHA="$(echo "${COMMIT_SHA}" | cut -c1-12)"
|
||||
|
||||
DEPLOYMENT_NAME="dataprovider01-dataprovider-orchestration-platform-dagster-user-template-code-location"
|
||||
|
||||
# Keep both the code server image and DAGSTER_CURRENT_IMAGE in sync.
|
||||
kubectl patch deployment "${DEPLOYMENT_NAME}" \
|
||||
-n "${K8S_NAMESPACE}" \
|
||||
--type='strategic' \
|
||||
-p="{\"spec\":{\"template\":{\"spec\":{\"containers\":[{\"name\":\"dagster-user-deployments\",\"image\":\"${IMAGE_REPO}:${SHORT_SHA}\",\"env\":[{\"name\":\"DAGSTER_CURRENT_IMAGE\",\"value\":\"${IMAGE_REPO}:${SHORT_SHA}\"}]}]}}}}"
|
||||
|
||||
# Wait for rollout to complete
|
||||
kubectl rollout status deployment/"${DEPLOYMENT_NAME}" \
|
||||
-n "${K8S_NAMESPACE}" \
|
||||
--timeout=5m
|
||||
79
.gitea/workflows/list_jobs.sh
Normal file
79
.gitea/workflows/list_jobs.sh
Normal file
@@ -0,0 +1,79 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# --- Configuration (override with env vars in CI) ---
|
||||
DIFF_OUTPUT="${DIFF_OUTPUT:-deleted_workflows.txt}"
|
||||
BASE_REF="${BASE_REF:-${GITHUB_BASE_REF:-main}}"
|
||||
HEAD_REF="${HEAD_REF:-${GITHUB_HEAD_REF:-HEAD}}"
|
||||
REPOSITORY_FILE="${REPOSITORY_FILE:-src/template_code_location/repository.py}"
|
||||
FAIL_ON_DELETION="${FAIL_ON_DELETION:-true}"
|
||||
|
||||
TMP_DIR="$(mktemp -d)"
|
||||
BASE_DIR="${TMP_DIR}/base"
|
||||
HEAD_DIR="${TMP_DIR}/head"
|
||||
BASE_JOBS_FILE="${TMP_DIR}/base_jobs.txt"
|
||||
HEAD_JOBS_FILE="${TMP_DIR}/head_jobs.txt"
|
||||
|
||||
export DAGSTER_HOME="$(mktemp -d)"
|
||||
|
||||
cleanup() {
|
||||
git worktree remove -f "${BASE_DIR}" >/dev/null 2>&1 || true
|
||||
git worktree remove -f "${HEAD_DIR}" >/dev/null 2>&1 || true
|
||||
rm -rf "${TMP_DIR}" "${DAGSTER_HOME}"
|
||||
}
|
||||
trap cleanup EXIT
|
||||
|
||||
get_jobs_for_ref() {
|
||||
local workdir="$1"
|
||||
|
||||
(
|
||||
cd "${workdir}"
|
||||
PYTHONPATH="${workdir}/src${PYTHONPATH:+:${PYTHONPATH}}" \
|
||||
dagster job list -f "${REPOSITORY_FILE}" 2>/dev/null | \
|
||||
grep '^Job: ' | \
|
||||
awk '{print $2}' | \
|
||||
sort -u
|
||||
)
|
||||
}
|
||||
|
||||
echo "Fetching refs from origin..."
|
||||
git fetch origin --quiet
|
||||
|
||||
if ! git rev-parse --verify "${BASE_REF}" >/dev/null 2>&1; then
|
||||
git fetch origin --quiet "${BASE_REF}:${BASE_REF}"
|
||||
fi
|
||||
|
||||
if [ "${HEAD_REF}" != "HEAD" ] && ! git rev-parse --verify "${HEAD_REF}" >/dev/null 2>&1; then
|
||||
git fetch origin --quiet "${HEAD_REF}:${HEAD_REF}"
|
||||
fi
|
||||
|
||||
echo "Preparing worktrees for ${BASE_REF} and ${HEAD_REF}..."
|
||||
git worktree add --quiet "${BASE_DIR}" "${BASE_REF}"
|
||||
# Use detached commit SHA for HEAD to avoid "already used by worktree" error
|
||||
HEAD_SHA="$(git rev-parse "${HEAD_REF}")"
|
||||
git worktree add --quiet --detach "${HEAD_DIR}" "${HEAD_SHA}"
|
||||
|
||||
echo "Collecting workflows/jobs from ${BASE_REF}..."
|
||||
get_jobs_for_ref "${BASE_DIR}" > "${BASE_JOBS_FILE}" || true
|
||||
|
||||
echo "Collecting workflows/jobs from ${HEAD_REF}..."
|
||||
get_jobs_for_ref "${HEAD_DIR}" > "${HEAD_JOBS_FILE}" || true
|
||||
|
||||
# comm -23: items present in base but missing from head
|
||||
comm -23 "${BASE_JOBS_FILE}" "${HEAD_JOBS_FILE}" > "${DIFF_OUTPUT}"
|
||||
|
||||
if [ -s "${DIFF_OUTPUT}" ]; then
|
||||
echo "------------------------------------------------"
|
||||
echo "DELETED WORKFLOWS DETECTED"
|
||||
echo "The following workflows/jobs exist in ${BASE_REF} but are missing in ${HEAD_REF}:"
|
||||
cat "${DIFF_OUTPUT}"
|
||||
echo "------------------------------------------------"
|
||||
|
||||
if [ "${FAIL_ON_DELETION}" = "true" ]; then
|
||||
exit 1
|
||||
fi
|
||||
else
|
||||
echo "No workflows/jobs were deleted in ${HEAD_REF} compared to ${BASE_REF}."
|
||||
rm -f "${DIFF_OUTPUT}"
|
||||
fi
|
||||
25
CHANGELOG.md
Normal file
25
CHANGELOG.md
Normal file
@@ -0,0 +1,25 @@
|
||||
## 0.2.0 (2026-06-03)
|
||||
|
||||
### added (2 changes)
|
||||
|
||||
- [[SIMPL-28408] Added readme.](https://code.europa.eu/simpl/simpl-open/development/data-services/template-code-location/-/commit/5760f9424af425645427598f38a1258c380d999a) ([merge request](https://code.europa.eu/simpl/simpl-open/development/data-services/template-code-location/-/merge_requests/14))
|
||||
- [[SIMPL-27992](https://jira.simplprogramme.eu/browse/SIMPL-27992) Document...](https://code.europa.eu/simpl/simpl-open/development/data-services/template-code-location/-/commit/81bb3299a44fa0c7047c5d485de210fee24e726f) ([merge request](https://code.europa.eu/simpl/simpl-open/development/data-services/template-code-location/-/merge_requests/9))
|
||||
|
||||
### fixed (1 change)
|
||||
|
||||
- [[SIMPL-27884](https://jira.simplprogramm.eu/browse/SIMPL-27884 Fix CHANGELOG.md](https://code.europa.eu/simpl/simpl-open/development/data-services/template-code-location/-/commit/63025e83c880d8cf7aacd552128f94d18fa02e2b) ([merge request](https://code.europa.eu/simpl/simpl-open/development/data-services/template-code-location/-/merge_requests/14))
|
||||
|
||||
### other (1 change)
|
||||
|
||||
- [[SIMPL-28035](https://jira.simplprogramme.eu/browse/SIMPL-28035) Update project version.](https://code.europa.eu/simpl/simpl-open/development/data-services/template-code-location/-/commit/dc0774599dbb8bfed9c20019a822539c79ace61c) ([merge request](https://code.europa.eu/simpl/simpl-open/development/data-services/template-code-location/-/merge_requests/14))
|
||||
|
||||
### changed (2 changes)
|
||||
|
||||
- [[SIMPL-23615](https://jira.simplprogramme.eu/browse/SIMPL-23615) Updated util services](https://code.europa.eu/simpl/simpl-open/development/data-services/template-code-location/-/commit/4723555e31c005cc4925940620abae77a0bcb8b8) ([merge request](https://code.europa.eu/simpl/simpl-open/development/data-services/template-code-location/-/merge_requests/11))
|
||||
- [[SIMPL-27992](https://jira.simplprogramme.eu/browse/SIMPL-27992) Update...](https://code.europa.eu/simpl/simpl-open/development/data-services/template-code-location/-/commit/96a8a8e4ebcfbea301fd27daf798b3737a4e824a) ([merge request](https://code.europa.eu/simpl/simpl-open/development/data-services/template-code-location/-/merge_requests/9))
|
||||
|
||||
### chenged (1 change)
|
||||
|
||||
- [[SIMPL-28035](https://jira.simplprogramme.eu/browse/SIMPL-28035) Document Workflow Update Process.](https://code.europa.eu/simpl/simpl-open/development/data-services/template-code-location/-/commit/1b154aa2d710279047443139c9a3632e945f09c9) ([merge request](https://code.europa.eu/simpl/simpl-open/development/data-services/template-code-location/-/merge_requests/9))
|
||||
|
||||
|
||||
37
Dockerfile
37
Dockerfile
@@ -11,16 +11,9 @@ RUN addgroup --gid 1000 appgroup && \
|
||||
|
||||
# Install system dependencies:
|
||||
# - git: required to fetch util-services from GitLab (tool.uv.sources)
|
||||
# - build-essential / gcc / g++ / python3-dev / cmake: native extensions
|
||||
# (scrubadub-spacy → spaCy, pycanon, etc.)
|
||||
# - curl: optional healthcheck / runtime tooling
|
||||
RUN apt-get update && apt-get upgrade -y \
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
build-essential=12.9 \
|
||||
cmake=3.25.1-1 \
|
||||
gcc=4:12.2.0-3 \
|
||||
g++=4:12.2.0-3 \
|
||||
python3-dev=3.11.2-1+b1 \
|
||||
git=1:2.39.5-0+deb12u3 \
|
||||
curl=7.88.1-10+deb12u14 \
|
||||
&& apt-get clean \
|
||||
@@ -28,41 +21,35 @@ RUN apt-get update && apt-get upgrade -y \
|
||||
&& rm -rf /tmp/* \
|
||||
&& rm -rf /var/tmp/*
|
||||
|
||||
# Pre-own /app so appuser can write to it
|
||||
RUN chown -R appuser:appgroup /app
|
||||
# Ensure appuser can create the project virtual environment in /app
|
||||
RUN chown appuser:appgroup /app
|
||||
|
||||
# Copy project metadata and source
|
||||
COPY pyproject.toml .
|
||||
COPY uv.lock .
|
||||
COPY src/ ./src/
|
||||
COPY --chown=appuser:appgroup pyproject.toml .
|
||||
COPY --chown=appuser:appgroup uv.lock .
|
||||
COPY --chown=appuser:appgroup src/ ./src/
|
||||
|
||||
# uv environment knobs:
|
||||
# UV_COMPILE_BYTECODE → compile .pyc files at install time for faster cold start
|
||||
# UV_COMPILE_BYTECODE → disable .pyc precompile to reduce image size
|
||||
# UV_LINK_MODE=copy → copy files instead of symlinks (required in Docker layers)
|
||||
# UV_SYSTEM_PYTHON=1 → install into the system Python (no extra venv needed)
|
||||
ENV UV_COMPILE_BYTECODE=1
|
||||
ENV UV_COMPILE_BYTECODE=0
|
||||
ENV UV_LINK_MODE=copy
|
||||
ENV UV_SYSTEM_PYTHON=1
|
||||
|
||||
# Install the project and all dependencies, respecting [tool.uv.sources]
|
||||
# (git source for util-services and pytorch-cpu index for torch)
|
||||
# BuildKit cache mount keeps the uv package cache across builds
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv sync --frozen --no-dev
|
||||
USER appuser
|
||||
|
||||
RUN --mount=type=cache,target=/home/appuser/.cache/uv,uid=1000,gid=1000 \
|
||||
uv sync --frozen --no-dev --no-install-package torch
|
||||
|
||||
# Put the project's venv on PATH (matches WORKDIR)
|
||||
ENV PATH="/app/.venv/bin:${PATH}"
|
||||
ENV PYTHONPATH="/app/src"
|
||||
|
||||
# Make /app writable for the non-root user (e.g. spaCy model downloads)
|
||||
RUN chown -R 1000:1000 /app && chmod -R u+w /app
|
||||
|
||||
# Provide a real home directory for appuser
|
||||
RUN mkdir -p /home/appuser && chown -R 1000:1000 /home/appuser
|
||||
ENV HOME=/home/appuser
|
||||
|
||||
USER appuser
|
||||
|
||||
# Sanity-check: fail the build early if the dagster CLI is missing
|
||||
RUN dagster --version
|
||||
|
||||
|
||||
3584
NOTICE.json
Normal file
3584
NOTICE.json
Normal file
File diff suppressed because one or more lines are too long
165
README.md
165
README.md
@@ -0,0 +1,165 @@
|
||||
# Template Code Location
|
||||
|
||||
> Purpose: `template-code-location` is the predefined Application Workflow Template for Simpl-Open providers. It offers a standard Dagster code location that providers can download, extend, and package when building new application workflows, reducing onboarding effort and avoiding configuration drift.
|
||||
|
||||
---
|
||||
|
||||
## Table of Contents
|
||||
|
||||
1. [Overview](#overview)
|
||||
2. [What This Template Includes](#what-this-template-includes)
|
||||
3. [Repository Layout](#repository-layout)
|
||||
4. [Prerequisites](#prerequisites)
|
||||
5. [Quick Start](#quick-start)
|
||||
6. [Development Guide](#development-guide)
|
||||
7. [Testing](#testing)
|
||||
8. [Packaging and Deployment](#packaging-and-deployment)
|
||||
9. [Documentation](#documentation)
|
||||
10. [Contributing](#contributing)
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
This repository provides a standardized baseline for Dagster code locations in the Simpl-Open ecosystem.
|
||||
|
||||
It is intended for providers who need a ready-made starting point for new application workflows. Instead of creating a code location from scratch, a provider can use this template to:
|
||||
|
||||
- start from an agreed repository structure
|
||||
- reuse the standard Dagster packaging model
|
||||
- follow the same conventions for jobs, ops, resources, and definitions
|
||||
- integrate existing Simpl-Open workflow packages as dependencies
|
||||
- add custom workflow logic with minimal setup effort
|
||||
|
||||
The template is intentionally simple. It includes a small example job and ops that demonstrate the expected pattern for creating new workflows, while the main Dagster entrypoint already shows how to register both local jobs and jobs imported from external packages.
|
||||
|
||||
## What This Template Includes
|
||||
|
||||
- A working Dagster code location package under `src/template_code_location`
|
||||
- A repository entrypoint in `src/template_code_location/repository.py`
|
||||
- Example local ops in `src/template_code_location/ops/ops.py`
|
||||
- Example local job in `src/template_code_location/jobs/jobs.py`
|
||||
- Preconfigured dependencies on shared Simpl-Open workflow packages through `pyproject.toml`
|
||||
- A Dockerfile for container packaging
|
||||
- Supporting guidance in the `documents/` folder
|
||||
|
||||
This template is not meant to contain business-specific logic by default. Providers are expected to replace or extend the sample workflow with their own application-specific jobs and operations.
|
||||
|
||||
## Repository Layout
|
||||
|
||||
```text
|
||||
template-code-location/
|
||||
├── src/
|
||||
│ └── template_code_location/
|
||||
│ ├── repository.py # Unified entry point (all jobs/sensors/resources)
|
||||
│ ├── jobs/ # Custom jobs specific to this code location
|
||||
│ │ └── jobs.py
|
||||
│ └── ops/ # Custom ops specific to this code location
|
||||
│ └── ops.py
|
||||
├── tests/ # Unit & integration tests
|
||||
├── Dockerfile
|
||||
├── pyproject.toml # Dependencies & external package sources
|
||||
└── README.md
|
||||
```
|
||||
|
||||
Key files:
|
||||
|
||||
- `src/template_code_location/repository.py`: Dagster `Definitions` entrypoint used to register jobs, sensors, resources, and loggers
|
||||
- `src/template_code_location/jobs/jobs.py`: place for custom workflow definitions
|
||||
- `src/template_code_location/ops/ops.py`: place for custom operational steps used by local jobs
|
||||
- `pyproject.toml`: package metadata and Git-based dependencies on shared Simpl-Open packages
|
||||
- [documents/Development Guide.md](documents/Development%20Guide.md): workflow lifecycle guidance for creating, editing, and deleting workflows
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Before using this template, make sure the following are available:
|
||||
|
||||
- Python 3.12+
|
||||
- `uv` package manager
|
||||
- Access to the Simpl-Open Git repositories referenced by `pyproject.toml`
|
||||
- A local or shared Dagster environment for validation
|
||||
- Docker, if you plan to build the image locally
|
||||
|
||||
Depending on the workflows you build, you may also need access to supporting platform services such as object storage, PostgreSQL, Vault, or Kubernetes.
|
||||
|
||||
## Quick Start
|
||||
|
||||
### 1. Install dependencies
|
||||
|
||||
```powershell
|
||||
uv sync --dev
|
||||
```
|
||||
|
||||
### 2. Start the Dagster code server locally
|
||||
|
||||
```powershell
|
||||
uv run dagster code-server start -h 0.0.0.0 -p 4000 -f src/template_code_location/repository.py
|
||||
```
|
||||
|
||||
This starts the code location using the repository entrypoint already defined in the template.
|
||||
|
||||
### 3. Review the sample workflow
|
||||
|
||||
The template ships with:
|
||||
|
||||
- `fetch_data` and `process_data` ops in `src/template_code_location/ops/ops.py`
|
||||
- `data_processing_job` in `src/template_code_location/jobs/jobs.py`
|
||||
|
||||
These are example building blocks only. Providers should rename, replace, or extend them to match the target application workflow.
|
||||
|
||||
## Development Guide
|
||||
|
||||
Workflow creation, update, deletion, and the expected way to work with external code locations are covered in [documents/Development Guide.md](documents/Development%20Guide.md).
|
||||
|
||||
Use that guide as the primary reference when customizing this template for a provider-specific workflow.
|
||||
|
||||
## Testing
|
||||
|
||||
Custom workflows should always be validated before publication.
|
||||
|
||||
Recommended checks:
|
||||
|
||||
- add unit tests for local jobs and ops
|
||||
- run `pytest` locally after every workflow change
|
||||
- verify that the Dagster code server can load `src/template_code_location/repository.py` without import or definition errors
|
||||
|
||||
Example command:
|
||||
|
||||
```powershell
|
||||
uv run pytest
|
||||
```
|
||||
|
||||
At the moment this template does not include a `tests/` directory, so providers should add one as part of implementing their own workflows.
|
||||
|
||||
## Packaging and Deployment
|
||||
|
||||
This repository is designed to be packaged as a Docker image and promoted through the standard Simpl-Open delivery flow.
|
||||
|
||||
High-level process:
|
||||
|
||||
1. Implement and validate the workflow locally.
|
||||
2. Commit the changes to a feature branch.
|
||||
3. Open a pull request for review.
|
||||
4. Build and publish the container image through CI/CD.
|
||||
5. Update the deployment configuration to use the new image.
|
||||
6. Verify the new code location in Dagster.
|
||||
|
||||
Use immutable image tags such as semantic versions or commit-based identifiers rather than `latest`.
|
||||
|
||||
## Documentation
|
||||
|
||||
Additional guidance is available in the repository documents:
|
||||
|
||||
- [documents/Development Guide.md](documents/Development%20Guide.md): workflow creation, edit, deletion, and deployment process
|
||||
- [documents/Output Separation and Non-Overwrite Principles.md](documents/Output%20Separation%20and%20Non-Overwrite%20Principles.md): output handling expectations for workflow implementations
|
||||
|
||||
## Contributing
|
||||
|
||||
When extending this template:
|
||||
|
||||
- keep the repository structure stable so onboarding stays predictable
|
||||
- preserve the separation between ops, jobs, and repository definitions
|
||||
- avoid hardcoded credentials and environment-specific values
|
||||
- update tests and documentation together with workflow changes
|
||||
|
||||
This repository should remain a reusable baseline for providers, not a dumping ground for unrelated or one-off implementations.
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
## 1. Goal and Scope
|
||||
|
||||
The purpose of this document is to provide a comprehensive guide for participants to create, manage, and update workflows within the Simpl-Open orchestration platform.
|
||||
The purpose of this document is to provide a comprehensive guide for participants to create, manage, update and delete workflows within the Simpl-Open orchestration platform.
|
||||
By following a *code-first approach*, developers ensure consistency, traceability, and reliability across all environments.
|
||||
|
||||
## 2. Local Development
|
||||
@@ -43,7 +43,10 @@ The heavy-lifting logic lives in separate repositories, pulled in as installable
|
||||
These packages expose their jobs and ops which are then imported and registered in `repository.py`.
|
||||
|
||||
### 2.3 Code Examples (Ops, Jobs, and Definitions)
|
||||
The orchestration logic should be modular. Here is a practical example of how to construct a workflow.
|
||||
The orchestration logic should be modular.
|
||||
|
||||
### 2.3.1 Workflow creation
|
||||
Here is a practical example of how to construct a workflow.
|
||||
|
||||
**1. Defining Ops (`ops/ops.py`)**
|
||||
Ops are the core units of computation. Keep them focused on a single task.
|
||||
@@ -100,6 +103,52 @@ defs = Definitions(
|
||||
loggers={"simpl": simpl_json_logger},
|
||||
)
|
||||
```
|
||||
### 2.3.2 Workflow edit
|
||||
|
||||
Updating a workflow, means changing the job name, parameters or definition.
|
||||
Each modification means getting a new version of that workflow. To make the version visible, the name of the job can include a version tag (eg: `data_processing_job_ver_2`).
|
||||
|
||||
**1. Change Jobs (`jobs/jobs.py`)**
|
||||
Change workflow name, parameters or definition in the `jobs.py` file.
|
||||
|
||||
**2. Change Ops (`ops/ops.py`)**
|
||||
If workflow ops are updated, this must be done in `ops.py` file.
|
||||
|
||||
**3. Change Tests (`tests/`)**
|
||||
If jobs or obs are updated, check out their related `tests` and update them accordingly.
|
||||
|
||||
**4. Update Registering Definitions (`repository.py`)**
|
||||
If jobs names are updated, make sure to update the import and definitions.
|
||||
|
||||
### 2.3.3 Workflow deletion
|
||||
Before deleting an existing workflow, first check if is needed or referenced in:
|
||||
- **Asset Orchestrator**
|
||||
- Perform an `GET/workflowDefinitions` api call, to get the workflow details.
|
||||
- **Dagster UI**:
|
||||
- Navigate to the "Runs". Identify a given workflow using the "Filter" button or navigation "Newer", "Older" buttons.
|
||||
- Check details and status: open workflow details by clicking on the target, or just check the status value.
|
||||
- Check logs: open workflow logs by clicking on the uuid.
|
||||
|
||||
Here are two practical examples how to delete a workflow:
|
||||
|
||||
**Temporary delete workflow**
|
||||
|
||||
**1. Clean Registering Definitions (`repository.py`)**
|
||||
Comment or delete workflow definition from the `repository.py`.
|
||||
|
||||
**Permanently delete workflow**
|
||||
|
||||
**1. Clean Jobs (`jobs/jobs.py`)**
|
||||
Delete workflow definition from the `jobs.py` file.
|
||||
|
||||
**2. Clean Ops (`ops/ops.py`)**
|
||||
If workflow links also ops, delete their definition from the `ops.py` file.
|
||||
|
||||
**3. Clean Tests (`tests/`)**
|
||||
If jobs or obs have dedicated unittest, check out the `tests/` folder and delete them.
|
||||
|
||||
**4. Clean Scheduler**
|
||||
If the workflow has any scheduler, check and adjust it accordingly.
|
||||
|
||||
### 2.4 Best Practices & Constraints
|
||||
|
||||
@@ -145,7 +194,9 @@ The deployment follows these automated steps:
|
||||
|
||||
To confirm a successful deployment:
|
||||
|
||||
- **Dagster UI**: Navigate to the "Deployment" or "Code Locations" tab. Verify that the loaded image tag matches the latest Git commit.
|
||||
- **Dagster UI**:
|
||||
- Navigate to the "Deployment" or "Code Locations" tab. Verify that the loaded image tag matches the latest Git commit.
|
||||
- Navigate to the "Runs" tab. Identify deleted workflow. Click on it's target to open details. "Pipeline not found" message shows that workflow was deleted successfully. Click on it's uuid to open logs. Logs must be available after deletion.
|
||||
- **Health Check**: Trigger a "Test Run" of the job in the production environment using a limited data slice.
|
||||
- **Logs**: Monitor the initialization logs in the Dagster daemon to ensure the code location was loaded without schema or dependency errors.
|
||||
|
||||
|
||||
151
documents/Output Separation and Non-Overwrite Principles.md
Normal file
151
documents/Output Separation and Non-Overwrite Principles.md
Normal file
@@ -0,0 +1,151 @@
|
||||
# Dagster Workflow – Input/Output Separation and Non-Overwrite Principles
|
||||
|
||||
## 1. Objective
|
||||
|
||||
The purpose of this document is to describe how Dagster workflows must be configured to ensure that input datasets are **never modified or overwritten** during processing.
|
||||
|
||||
The workflow must:
|
||||
- Read data from a **source location (input)**
|
||||
- Write processed data to a **separate destination (output)**
|
||||
- Preserve the original dataset unchanged
|
||||
|
||||
---
|
||||
|
||||
## 2. Key Principles
|
||||
|
||||
To avoid overwriting input datasets, the following principles must always be applied when designing Dagster workflows:
|
||||
|
||||
### 2.1 Separation of Input and Output
|
||||
- Input and output must **always refer to different storage locations**
|
||||
- This separation can be enforced via:
|
||||
- Different `file_key` values
|
||||
- Different buckets or paths
|
||||
- Different prefixes or folders
|
||||
|
||||
### 2.2 Read-Only Input
|
||||
- Input datasets must be treated as **immutable**
|
||||
- No operation should write back to the input path
|
||||
|
||||
### 2.3 Explicit Output Configuration
|
||||
- Output destinations must be explicitly configured
|
||||
- Avoid default or implicit reuse of input configuration
|
||||
|
||||
### 2.4 Idempotent Processing
|
||||
- Workflow execution should not produce side effects on the original dataset
|
||||
- Re-running the workflow must not alter the source data
|
||||
|
||||
---
|
||||
|
||||
## 3. Test Setup and Execution
|
||||
|
||||
### 3.1 Dataset Configuration
|
||||
- Prepare an input dataset (e.g. `input.csv`) in a defined location:
|
||||
- Example: `s3://dagster-workflow-bucket/input.csv`
|
||||
|
||||
### 3.2 Workflow Configuration
|
||||
- Configure Dagster to:
|
||||
- Read from the input dataset
|
||||
- Write results to a different location (e.g. `output.csv`)
|
||||
|
||||
### 3.3 Execution
|
||||
- Run the Dagster job
|
||||
- Ensure that:
|
||||
- Data is successfully processed
|
||||
- Output dataset is generated
|
||||
|
||||
---
|
||||
|
||||
## 4. Verification of No Overwrite
|
||||
|
||||
To validate correct behavior:
|
||||
|
||||
- Compare input dataset **before and after execution**
|
||||
- Ensure:
|
||||
- File content is unchanged
|
||||
- File timestamp/version is unchanged (if applicable)
|
||||
- Verify that:
|
||||
- Output dataset exists in a different location
|
||||
- Output contains only processed data
|
||||
|
||||
---
|
||||
|
||||
## 5. Example – Simpl-Open Pre-Built Workflow Configuration
|
||||
|
||||
In Simpl-Open pre-built workflows, this principle is **already enforced by design**.
|
||||
|
||||
Below is an example configuration:
|
||||
|
||||
```yaml
|
||||
ops:
|
||||
apply_l_diversity:
|
||||
config:
|
||||
generalisation_hierarchies:
|
||||
age: simpl_age
|
||||
ident:
|
||||
- Name
|
||||
k: 2
|
||||
l: 3
|
||||
quasi_identifiers:
|
||||
- age
|
||||
sensitive_attribute: Disease
|
||||
supp_level: 50.0
|
||||
|
||||
read_structured_from_s3:
|
||||
config:
|
||||
bucket_name: dagster-workflow-bucket
|
||||
file_format: csv
|
||||
file_key: input.csv
|
||||
|
||||
write_df_to_s3:
|
||||
config:
|
||||
bucket_name: dagster-workflow-bucket
|
||||
file_format: csv
|
||||
file_key: output.csv
|
||||
```
|
||||
|
||||
### Explanation
|
||||
|
||||
- `read_structured_from_s3`
|
||||
- Reads the dataset from `input.csv`
|
||||
|
||||
- `write_df_to_s3`
|
||||
- Writes the processed dataset to `output.csv`
|
||||
|
||||
### Key Point
|
||||
|
||||
Even when using the **same bucket**, separation is guaranteed by:
|
||||
- Using a **different `file_key` for output**
|
||||
|
||||
This ensures that:
|
||||
- The input dataset (`input.csv`) is never overwritten
|
||||
- The output dataset is stored independently (`output.csv`)
|
||||
|
||||
---
|
||||
|
||||
## 6. Configuration Guidelines
|
||||
|
||||
When creating or customizing Dagster workflows, follow these guidelines:
|
||||
|
||||
- Always define a **dedicated output path**
|
||||
- Never reuse the same `file_key` for input and output
|
||||
- Prefer:
|
||||
- Different filenames (`input.csv` vs `output.csv`)
|
||||
- Or structured paths:
|
||||
- `/input/...`
|
||||
- `/output/...`
|
||||
- Validate configuration before execution
|
||||
|
||||
---
|
||||
|
||||
## 7. Conclusion
|
||||
|
||||
The separation between input and output datasets is a **design principle** in Dagster workflows.
|
||||
|
||||
Simpl-Open pre-built workflows already implement this approach by:
|
||||
- Clearly distinguishing input and output configurations
|
||||
- Ensuring safe, non-destructive data processing
|
||||
|
||||
Adhering to these principles guarantees:
|
||||
- Data integrity
|
||||
- Reproducibility
|
||||
- Safe pipeline execution without unintended overwrites
|
||||
@@ -1 +1 @@
|
||||
PROJECT_VERSION_NUMBER="0.1.0"
|
||||
PROJECT_VERSION_NUMBER="0.2.0"
|
||||
@@ -4,11 +4,12 @@ build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "template-code-location"
|
||||
version = "0.1.0"
|
||||
version = "0.2.0"
|
||||
description = "Consolidated code location for all data services workflows"
|
||||
requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
"dagster>=1.8.13",
|
||||
"dagster-postgres>=0.24.13",
|
||||
"util-services",
|
||||
"data-processing",
|
||||
"dataframe-level-anonymisation",
|
||||
@@ -16,22 +17,15 @@ dependencies = [
|
||||
]
|
||||
|
||||
[tool.uv]
|
||||
exclude-dependencies = ["transformers", "spacy-transformers"]
|
||||
override-dependencies = [
|
||||
"util-services @ git+https://code.europa.eu/simpl/simpl-open/development/data-services/util-services.git@v0.6.1",
|
||||
"util-services @ git+https://code.europa.eu/simpl/simpl-open/development/data-services/util-services.git@v0.7.0",
|
||||
]
|
||||
|
||||
[tool.uv.sources]
|
||||
torch = { index = "pytorch-cpu" }
|
||||
util-services = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/util-services.git", rev = "v0.6.1" }
|
||||
data-processing = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/data-processing.git", branch = "0.4.0" }
|
||||
dataframe-level-anonymisation = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/dataframe-level-anonymisation.git", branch = "0.6.0" }
|
||||
field-level-pseudo-anonymisation = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/field-level-pseudo-anonymisation.git", branch = "0.7.0" }
|
||||
|
||||
[[tool.uv.index]]
|
||||
name = "pytorch-cpu"
|
||||
url = "https://download.pytorch.org/whl/cpu"
|
||||
explicit = true
|
||||
util-services = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/util-services.git", rev = "v0.7.0" }
|
||||
data-processing = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/data-processing.git", rev = "v0.4.0" }
|
||||
dataframe-level-anonymisation = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/dataframe-level-anonymisation.git", rev = "v0.6.0" }
|
||||
field-level-pseudo-anonymisation = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/field-level-pseudo-anonymisation.git", rev = "v0.7.0" }
|
||||
|
||||
[project.optional-dependencies]
|
||||
dev = [
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
from dagster import job
|
||||
from ..ops.ops import fetch_data, process_data
|
||||
from dagster import ScheduleDefinition, job
|
||||
from util_services.util_ops import preview_dataframe
|
||||
|
||||
from ..ops.ops import (fetch_data, generate_example_dataset, process_data,
|
||||
transform_example_dataset)
|
||||
|
||||
|
||||
@job
|
||||
@@ -7,3 +10,23 @@ def data_processing_job():
|
||||
"""A simple job that fetches and processes data."""
|
||||
raw = fetch_data()
|
||||
process_data(raw)
|
||||
|
||||
data_processing_job_daily_9am = ScheduleDefinition(
|
||||
job=data_processing_job,
|
||||
cron_schedule="0 9 * * *",
|
||||
)
|
||||
|
||||
@job
|
||||
def example_job():
|
||||
original_df = generate_example_dataset()
|
||||
preview_dataframe(original_df)
|
||||
|
||||
@job
|
||||
def example_dataframe_demo_job():
|
||||
"""Demonstrates generating and transforming a pandas DataFrame with previews."""
|
||||
original_df = generate_example_dataset()
|
||||
preview_dataframe(original_df)
|
||||
transformed_df = transform_example_dataset(original_df)
|
||||
preview_dataframe(transformed_df)
|
||||
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import pandas as pd
|
||||
from dagster import op
|
||||
|
||||
|
||||
@@ -11,3 +12,29 @@ def fetch_data() -> list:
|
||||
def process_data(data: list) -> dict:
|
||||
"""Processes raw data and returns a summary."""
|
||||
return {"count": len(data), "status": "success"}
|
||||
|
||||
|
||||
@op
|
||||
def generate_example_dataset() -> pd.DataFrame:
|
||||
"""Generates a small example dataset as a pandas DataFrame."""
|
||||
return pd.DataFrame(
|
||||
[
|
||||
{"order_id": 1001, "customer": "Alice", "quantity": 2, "unit_price": 9.50},
|
||||
{"order_id": 1002, "customer": "Bob", "quantity": 1, "unit_price": 15.00},
|
||||
{"order_id": 1003, "customer": "Charlie", "quantity": 4, "unit_price": 7.25},
|
||||
{"order_id": 1004, "customer": "Dora", "quantity": 3, "unit_price": 11.00},
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
@op
|
||||
def transform_example_dataset(df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Applies a simple transformation to the example dataset."""
|
||||
|
||||
df["total_price"] = df["quantity"] * df["unit_price"]
|
||||
|
||||
df["price_band"] = df["total_price"].apply(
|
||||
lambda value: "high" if value >= 25 else "standard"
|
||||
)
|
||||
|
||||
return df
|
||||
|
||||
@@ -1,70 +1,25 @@
|
||||
from dagster import Definitions
|
||||
from util_services.resources import s3_resource
|
||||
from util_services.sensors import (
|
||||
notify_success,
|
||||
notify_failure,
|
||||
notify_canceled
|
||||
)
|
||||
from util_services.custom_json_logger import simpl_json_logger
|
||||
|
||||
# Data processing jobs
|
||||
from data_processing.jobs import (
|
||||
remove_duplicates_job_s3,
|
||||
fill_missing_values_job_s3,
|
||||
standardize_categorical_values_job_s3,
|
||||
correct_typos_job_s3,
|
||||
normalize_numeric_min_max_job_s3,
|
||||
normalize_datetime_job_s3,
|
||||
normalize_coordinates_job_s3,
|
||||
add_global_aggregations_job_s3,
|
||||
filter_dataset_job_s3,
|
||||
quality_job_s3
|
||||
)
|
||||
|
||||
# Dataframe-level anonymisation jobs
|
||||
from dataframe_level_anonymisation.jobs import (
|
||||
k_anonymity_job_s3,
|
||||
from dataframe_level_anonymisation.jobs import (k_anonymity_job_s3,
|
||||
l_diversity_job_s3,
|
||||
t_closeness_job_s3,
|
||||
read_write_semistructured_job_s3,
|
||||
)
|
||||
t_closeness_job_s3)
|
||||
from template_code_location.jobs.jobs import (data_processing_job,
|
||||
data_processing_job_daily_9am,
|
||||
example_dataframe_demo_job,
|
||||
example_job)
|
||||
from util_services.custom_json_logger import simpl_json_logger
|
||||
from util_services.resources import s3_resource
|
||||
|
||||
# Field-level pseudo-anonymisation jobs
|
||||
from field_level_pseudo_anonymisation.jobs import (
|
||||
anonymise_pseudonymise_structured_job_s3,
|
||||
depseudonymise_structured_job_s3,
|
||||
anonymise_pseudonymise_unstructured_job_s3,
|
||||
depseudonymise_unstructured_job_s3,
|
||||
)
|
||||
|
||||
from template_code_location.jobs.jobs import data_processing_job
|
||||
|
||||
defs = Definitions(
|
||||
jobs=[
|
||||
data_processing_job,
|
||||
# Data processing
|
||||
remove_duplicates_job_s3,
|
||||
fill_missing_values_job_s3,
|
||||
standardize_categorical_values_job_s3,
|
||||
correct_typos_job_s3,
|
||||
normalize_numeric_min_max_job_s3,
|
||||
normalize_datetime_job_s3,
|
||||
normalize_coordinates_job_s3,
|
||||
add_global_aggregations_job_s3,
|
||||
filter_dataset_job_s3,
|
||||
quality_job_s3,
|
||||
# Dataframe-level anonymisation
|
||||
example_dataframe_demo_job,
|
||||
k_anonymity_job_s3,
|
||||
l_diversity_job_s3,
|
||||
t_closeness_job_s3,
|
||||
read_write_semistructured_job_s3,
|
||||
# Field-level pseudo-anonymisation
|
||||
anonymise_pseudonymise_structured_job_s3,
|
||||
depseudonymise_structured_job_s3,
|
||||
anonymise_pseudonymise_unstructured_job_s3,
|
||||
depseudonymise_unstructured_job_s3,
|
||||
],
|
||||
sensors=[notify_success, notify_failure, notify_canceled],
|
||||
schedules=[data_processing_job_daily_9am],
|
||||
sensors=[],
|
||||
resources={"s3": s3_resource.configured({"resource_name": "selfS3"})},
|
||||
loggers={"simpl": simpl_json_logger},
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user