diff options
author | Sergii Tkachenko <sergiitk@google.com> | 2023-10-16 15:44:51 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-10-16 15:44:51 -0700 |
commit | 1c4da38d40d961e5841b2a7e8a5821fb25b4f593 (patch) | |
tree | af1653375a6edeed90e462339d326776b5db4edc /tools/internal_ci | |
parent | 997c73a6a47ef8aa5ef9c8814ecd783b13f71828 (diff) | |
download | grpc-grpc-1c4da38d40d961e5841b2a7e8a5821fb25b4f593.tar.gz |
[PSM Interop] New cleanup script (#33460)
1. Changes the resource retention period to 2 days for all resources
(previously 7 days for TD resources, 6 hours for k8s). This solves a
problem with k8s resources being stuck because the corresponding TD
resources weren't deleted.
2. Resume on namespace cleanup failures
3. Add secondary lb cluster cleanup logic
4. Modularize `grpc_xds_resource_cleanup.sh`
5. Make `KubernetesNamespace`'s methods `pretty_format_status` and
`pretty_format_metadata` public
6. `pretty_format_status`: also print the resource kind and the creation
and deletion-requested dates
ref b/259724370, cl/517235715
Diffstat (limited to 'tools/internal_ci')
-rw-r--r-- | tools/internal_ci/linux/grpc_xds_resource_cleanup.sh | 173 |
1 file changed, 124 insertions, 49 deletions
diff --git a/tools/internal_ci/linux/grpc_xds_resource_cleanup.sh b/tools/internal_ci/linux/grpc_xds_resource_cleanup.sh index 22b660ccdf..ad212f505a 100644 --- a/tools/internal_ci/linux/grpc_xds_resource_cleanup.sh +++ b/tools/internal_ci/linux/grpc_xds_resource_cleanup.sh @@ -13,61 +13,136 @@ # See the License for the specific language governing permissions and # limitations under the License. -set -ex +set -eo pipefail -# consts +# Constants readonly GITHUB_REPOSITORY_NAME="grpc" readonly TEST_DRIVER_INSTALL_SCRIPT_URL="https://raw.githubusercontent.com/${TEST_DRIVER_REPO_OWNER:-grpc}/grpc/${TEST_DRIVER_BRANCH:-master}/tools/internal_ci/linux/grpc_xds_k8s_install_test_driver.sh" +# Keep orphaned resources last 2 days. +readonly KEEP_HOURS="${KEEP_HOURS:-48}" -cd "$(dirname "$0")/../../.." +cleanup::activate_cluster() { + activate_gke_cluster "$1" + gcloud container clusters get-credentials "${GKE_CLUSTER_NAME}" \ + --zone "${GKE_CLUSTER_ZONE}" + CLEANUP_KUBE_CONTEXT="$(kubectl config current-context)" +} -# Source the test driver from the master branch. -echo "Sourcing test driver install script from: ${TEST_DRIVER_INSTALL_SCRIPT_URL}" -source /dev/stdin <<< "$(curl -s "${TEST_DRIVER_INSTALL_SCRIPT_URL}")" -activate_gke_cluster GKE_CLUSTER_PSM_SECURITY -kokoro_setup_test_driver "${GITHUB_REPOSITORY_NAME}" +cleanup::activate_secondary_cluster_as_primary() { + activate_secondary_gke_cluster "$1" + GKE_CLUSTER_NAME="${SECONDARY_GKE_CLUSTER_NAME}" + GKE_CLUSTER_ZONE="${SECONDARY_GKE_CLUSTER_ZONE}" + gcloud container clusters get-credentials "${GKE_CLUSTER_NAME}" \ + --zone "${GKE_CLUSTER_ZONE}" + CLEANUP_KUBE_CONTEXT="$(kubectl config current-context)" +} -cd "${TEST_DRIVER_FULL_DIR}" +cleanup::job::cleanup_td() { + cleanup::run_clean "$1" --mode=td +} -# flag resource_prefix is required by the gke test framework, but doesn't -# matter for the cleanup script. 
-python3 -m bin.cleanup.cleanup \ - --project=grpc-testing \ - --network=default-vpc \ - --kube_context="${KUBE_CONTEXT}" \ - --gcp_service_account=xds-k8s-interop-tests@grpc-testing.iam.gserviceaccount.com \ - --resource_prefix='required-but-does-not-matter' \ - --td_bootstrap_image='required-but-does-not-matter' --server_image='required-but-does-not-matter' --client_image='required-but-does-not-matter' +####################################### +# The PSM_LB cluster is used by k8s_lb tests. +# The keep hours is reduced to 6. +####################################### +cleanup::job::cleanup_cluster_lb_primary() { + cleanup::activate_cluster GKE_CLUSTER_PSM_LB + cleanup::run_clean "$1" --mode=k8s +} -# The BASIC cluster is used by url-map tests. Only cleaning the GKE client -# namespaces, which won't provide much value in debugging. The keep hours is -# reduced to 6. -activate_gke_cluster GKE_CLUSTER_PSM_BASIC -# Invoking the get-crednetials directly, because the -# gcloud_get_cluster_credentials re-sets readonly Bash variables, which is nice -# safety mechanism to keep. -gcloud container clusters get-credentials "${GKE_CLUSTER_NAME}" --zone "${GKE_CLUSTER_ZONE}" -TARGET_KUBE_CONTEXT="$(kubectl config current-context)" -python3 -m bin.cleanup.namespace \ - --project=grpc-testing \ - --network=default-vpc \ - --keep_hours=6 \ - --kube_context="${TARGET_KUBE_CONTEXT}" \ - --gcp_service_account=xds-k8s-interop-tests@grpc-testing.iam.gserviceaccount.com \ - --resource_prefix='required-but-does-not-matter' \ - --td_bootstrap_image='required-but-does-not-matter' --server_image='required-but-does-not-matter' --client_image='required-but-does-not-matter' +####################################### +# Secondary PSM_LB cluster is used by k8s_lb tests. +# The keep hours is reduced to 6. 
+####################################### +cleanup::job::cleanup_cluster_lb_secondary() { + cleanup::activate_secondary_cluster_as_primary GKE_CLUSTER_PSM_LB + cleanup::run_clean "$1" --mode=k8s --secondary +} -# The PSM_LB cluster is used by k8s_lb tests. Only cleaning the GKE client -# namespaces, which won't provide much value in debugging. The keep hours is -# reduced to 6. -activate_gke_cluster GKE_CLUSTER_PSM_LB -gcloud container clusters get-credentials "${GKE_CLUSTER_NAME}" --zone "${GKE_CLUSTER_ZONE}" -TARGET_KUBE_CONTEXT="$(kubectl config current-context)" -python3 -m bin.cleanup.namespace \ - --project=grpc-testing \ - --network=default-vpc \ - --keep_hours=6 \ - --kube_context="${TARGET_KUBE_CONTEXT}" \ - --gcp_service_account=xds-k8s-interop-tests@grpc-testing.iam.gserviceaccount.com \ - --resource_prefix='required-but-does-not-matter' \ - --td_bootstrap_image='required-but-does-not-matter' --server_image='required-but-does-not-matter' --client_image='required-but-does-not-matter' +####################################### +# The BASIC cluster is used by url-map tests. Only cleaning the xds client +# namespaces; the xds server namespaces are shared. +# The keep hours is reduced to 6. +####################################### +cleanup::job::cleanup_cluster_url_map() { + cleanup::activate_cluster GKE_CLUSTER_PSM_BASIC + cleanup::run_clean "$1" --mode=k8s +} + +####################################### +# The SECURITY cluster is used by the security and authz test suites. +####################################### +cleanup::job::cleanup_cluster_security() { + cleanup::activate_cluster GKE_CLUSTER_PSM_SECURITY + cleanup::run_clean "$1" --mode=k8s +} + +####################################### +# Set common variables for the cleanup script. +# Globals: +# TEST_DRIVER_FLAGFILE: Relative path to test driver flagfile +# TEST_XML_OUTPUT_DIR: Output directory for the test xUnit XML report +# CLEANUP_KUBE_CONTEXT: The name of kubectl context with GKE cluster access. 
+# Arguments: +# Test job name. Currently only used to generate asset path, and uses +# values from the cleanup_jobs array of main(). +# TODO(sergiitk): turn job_name into action test methods of the cleanup. +# Outputs: +# Writes the output of test execution to stdout, stderr, +# ${TEST_XML_OUTPUT_DIR}/${job_name}/sponge_log.log +####################################### +cleanup::run_clean() { + local job_name="${1:?Usage: cleanup::run_clean job_name}" + local out_dir="${TEST_XML_OUTPUT_DIR}/${job_name}" + mkdir -pv "${out_dir}" + # TODO(sergiitk): make it a test, where job_name is a separate method. + python3 -m bin.cleanup.cleanup \ + --flagfile="${TEST_DRIVER_FLAGFILE}" \ + --kube_context="${CLEANUP_KUBE_CONTEXT:-unset}" \ + --keep_hours="${KEEP_HOURS}" \ + "${@:2}" \ + |& tee "${out_dir}/sponge_log.log" +} + +####################################### +# Main function: provision software necessary to execute the cleanup tasks; +# run them, and report the status. +####################################### +main() { + local script_dir + script_dir="$(dirname "$0")" + + # Source the test captured from the master branch. + echo "Sourcing test driver install captured from: ${TEST_DRIVER_INSTALL_SCRIPT_URL}" + source /dev/stdin <<< "$(curl -s "${TEST_DRIVER_INSTALL_SCRIPT_URL}")" + set +x + + # Valid cluster variables needed for the automatic driver setup. 
+ activate_gke_cluster GKE_CLUSTER_PSM_BASIC + kokoro_setup_test_driver "${GITHUB_REPOSITORY_NAME}" + + # Run tests + cd "${TEST_DRIVER_FULL_DIR}" + local failed_jobs=0 + declare -a cleanup_jobs + cleanup_jobs=( + "cleanup_td" + "cleanup_cluster_lb_primary" + "cleanup_cluster_lb_secondary" + "cleanup_cluster_security" + "cleanup_cluster_url_map" + ) + for job_name in "${cleanup_jobs[@]}"; do + echo "-------------------- Starting job ${job_name} --------------------" + set -x + "cleanup::job::${job_name}" "${job_name}" || (( ++failed_jobs )) + set +x + echo "-------------------- Finished job ${job_name} --------------------" + done + echo "Failed job suites: ${failed_jobs}" + if (( failed_jobs > 0 )); then + exit 1 + fi +} + +main "$@" |