aboutsummaryrefslogtreecommitdiff
path: root/tools/internal_ci
diff options
context:
space:
mode:
authorSergii Tkachenko <sergiitk@google.com>2023-10-16 15:44:51 -0700
committerGitHub <noreply@github.com>2023-10-16 15:44:51 -0700
commit1c4da38d40d961e5841b2a7e8a5821fb25b4f593 (patch)
treeaf1653375a6edeed90e462339d326776b5db4edc /tools/internal_ci
parent997c73a6a47ef8aa5ef9c8814ecd783b13f71828 (diff)
downloadgrpc-grpc-1c4da38d40d961e5841b2a7e8a5821fb25b4f593.tar.gz
[PSM Interop] New cleanup script (#33460)
1. Changes the resource retention period to 2 days for all resources (previously 7 days for TD resources, 6 hours for k8s). This solved a problem with k8s resources being stuck because corresponding TD resources weren't deleted. 2. Resume on namespace cleanup failures 3. Add secondary lb cluster cleanup logic 4. Modularize `grpc_xds_resource_cleanup.sh` 5. Make `KubernetesNamespace`'s methods `pretty_format_status` and `pretty_format_metadata` public 6. `pretty_format_status`: also print resource kind, creation and deletion requested dates ref b/259724370, cl/517235715
Diffstat (limited to 'tools/internal_ci')
-rw-r--r--tools/internal_ci/linux/grpc_xds_resource_cleanup.sh173
1 files changed, 124 insertions, 49 deletions
diff --git a/tools/internal_ci/linux/grpc_xds_resource_cleanup.sh b/tools/internal_ci/linux/grpc_xds_resource_cleanup.sh
index 22b660ccdf..ad212f505a 100644
--- a/tools/internal_ci/linux/grpc_xds_resource_cleanup.sh
+++ b/tools/internal_ci/linux/grpc_xds_resource_cleanup.sh
@@ -13,61 +13,136 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-set -ex
+set -eo pipefail
-# consts
+# Constants
readonly GITHUB_REPOSITORY_NAME="grpc"
readonly TEST_DRIVER_INSTALL_SCRIPT_URL="https://raw.githubusercontent.com/${TEST_DRIVER_REPO_OWNER:-grpc}/grpc/${TEST_DRIVER_BRANCH:-master}/tools/internal_ci/linux/grpc_xds_k8s_install_test_driver.sh"
+# Retention period for orphaned resources, in hours (default: 2 days).
+readonly KEEP_HOURS="${KEEP_HOURS:-48}"
-cd "$(dirname "$0")/../../.."
+#######################################
+# Activates the given GKE cluster and records the kubectl context to use.
+# Globals:
+#   GKE_CLUSTER_NAME: read; presumably set by activate_gke_cluster, which is
+#     defined outside this file - TODO confirm.
+#   GKE_CLUSTER_ZONE: read; same assumption as GKE_CLUSTER_NAME.
+#   CLEANUP_KUBE_CONTEXT: written; the kubectl context current after
+#     credentials are fetched.
+# Arguments:
+#   Cluster identifier, passed through to activate_gke_cluster.
+# Returns:
+#   Non-zero as soon as any step fails.
+#######################################
+cleanup::activate_cluster() {
+  # Explicit "|| return": main() runs jobs on the left side of "||", where
+  # errexit is ignored. Without it a failed activation would go unnoticed and
+  # CLEANUP_KUBE_CONTEXT would be set to whatever context was already current.
+  activate_gke_cluster "$1" || return
+  gcloud container clusters get-credentials "${GKE_CLUSTER_NAME}" \
+    --zone "${GKE_CLUSTER_ZONE}" || return
+  CLEANUP_KUBE_CONTEXT="$(kubectl config current-context)" || return
+}
-# Source the test driver from the master branch.
-echo "Sourcing test driver install script from: ${TEST_DRIVER_INSTALL_SCRIPT_URL}"
-source /dev/stdin <<< "$(curl -s "${TEST_DRIVER_INSTALL_SCRIPT_URL}")"
-activate_gke_cluster GKE_CLUSTER_PSM_SECURITY
-kokoro_setup_test_driver "${GITHUB_REPOSITORY_NAME}"
+#######################################
+# Activates the secondary GKE cluster and exposes it through the primary
+# cluster variables, then records the kubectl context to use.
+# Globals:
+#   SECONDARY_GKE_CLUSTER_NAME: read; presumably set by
+#     activate_secondary_gke_cluster, defined outside this file - TODO confirm.
+#   SECONDARY_GKE_CLUSTER_ZONE: read; same assumption.
+#   GKE_CLUSTER_NAME: overwritten with the secondary cluster name.
+#   GKE_CLUSTER_ZONE: overwritten with the secondary cluster zone.
+#   CLEANUP_KUBE_CONTEXT: written; the kubectl context current after
+#     credentials are fetched.
+# Arguments:
+#   Cluster identifier, passed through to activate_secondary_gke_cluster.
+# Returns:
+#   Non-zero as soon as a command step fails.
+#######################################
+cleanup::activate_secondary_cluster_as_primary() {
+  # Explicit "|| return": main() runs jobs on the left side of "||", where
+  # errexit is ignored, so failures must be propagated by hand.
+  activate_secondary_gke_cluster "$1" || return
+  GKE_CLUSTER_NAME="${SECONDARY_GKE_CLUSTER_NAME}"
+  GKE_CLUSTER_ZONE="${SECONDARY_GKE_CLUSTER_ZONE}"
+  gcloud container clusters get-credentials "${GKE_CLUSTER_NAME}" \
+    --zone "${GKE_CLUSTER_ZONE}" || return
+  CLEANUP_KUBE_CONTEXT="$(kubectl config current-context)" || return
+}
-cd "${TEST_DRIVER_FULL_DIR}"
+#######################################
+# Cleanup job that runs the cleanup in "td" mode.
+# Arguments:
+#   Job name, forwarded to cleanup::run_clean.
+#######################################
+cleanup::job::cleanup_td() {
+  local job_name="$1"
+  cleanup::run_clean "${job_name}" --mode=td
+}
-# flag resource_prefix is required by the gke test framework, but doesn't
-# matter for the cleanup script.
-python3 -m bin.cleanup.cleanup \
- --project=grpc-testing \
- --network=default-vpc \
- --kube_context="${KUBE_CONTEXT}" \
- --gcp_service_account=xds-k8s-interop-tests@grpc-testing.iam.gserviceaccount.com \
- --resource_prefix='required-but-does-not-matter' \
- --td_bootstrap_image='required-but-does-not-matter' --server_image='required-but-does-not-matter' --client_image='required-but-does-not-matter'
+#######################################
+# The PSM_LB cluster is used by k8s_lb tests.
+# The retention period is the shared KEEP_HOURS value.
+# Arguments:
+#   Job name, forwarded to cleanup::run_clean.
+# Returns:
+#   Non-zero if cluster activation or the cleanup run fails.
+#######################################
+cleanup::job::cleanup_cluster_lb_primary() {
+  # Chained with "&&": main() runs jobs on the left side of "||", where
+  # errexit is ignored, so the cleanup must be skipped explicitly when the
+  # cluster could not be activated.
+  cleanup::activate_cluster GKE_CLUSTER_PSM_LB \
+    && cleanup::run_clean "$1" --mode=k8s
+}
-# The BASIC cluster is used by url-map tests. Only cleaning the GKE client
-# namespaces, which won't provide much value in debugging. The keep hours is
-# reduced to 6.
-activate_gke_cluster GKE_CLUSTER_PSM_BASIC
-# Invoking the get-crednetials directly, because the
-# gcloud_get_cluster_credentials re-sets readonly Bash variables, which is nice
-# safety mechanism to keep.
-gcloud container clusters get-credentials "${GKE_CLUSTER_NAME}" --zone "${GKE_CLUSTER_ZONE}"
-TARGET_KUBE_CONTEXT="$(kubectl config current-context)"
-python3 -m bin.cleanup.namespace \
- --project=grpc-testing \
- --network=default-vpc \
- --keep_hours=6 \
- --kube_context="${TARGET_KUBE_CONTEXT}" \
- --gcp_service_account=xds-k8s-interop-tests@grpc-testing.iam.gserviceaccount.com \
- --resource_prefix='required-but-does-not-matter' \
- --td_bootstrap_image='required-but-does-not-matter' --server_image='required-but-does-not-matter' --client_image='required-but-does-not-matter'
+#######################################
+# Secondary PSM_LB cluster is used by k8s_lb tests.
+# The retention period is the shared KEEP_HOURS value.
+# Arguments:
+#   Job name, forwarded to cleanup::run_clean.
+# Returns:
+#   Non-zero if cluster activation or the cleanup run fails.
+#######################################
+cleanup::job::cleanup_cluster_lb_secondary() {
+  # Chained with "&&": main() runs jobs on the left side of "||", where
+  # errexit is ignored, so the cleanup must be skipped explicitly when the
+  # cluster could not be activated.
+  cleanup::activate_secondary_cluster_as_primary GKE_CLUSTER_PSM_LB \
+    && cleanup::run_clean "$1" --mode=k8s --secondary
+}
-# The PSM_LB cluster is used by k8s_lb tests. Only cleaning the GKE client
-# namespaces, which won't provide much value in debugging. The keep hours is
-# reduced to 6.
-activate_gke_cluster GKE_CLUSTER_PSM_LB
-gcloud container clusters get-credentials "${GKE_CLUSTER_NAME}" --zone "${GKE_CLUSTER_ZONE}"
-TARGET_KUBE_CONTEXT="$(kubectl config current-context)"
-python3 -m bin.cleanup.namespace \
- --project=grpc-testing \
- --network=default-vpc \
- --keep_hours=6 \
- --kube_context="${TARGET_KUBE_CONTEXT}" \
- --gcp_service_account=xds-k8s-interop-tests@grpc-testing.iam.gserviceaccount.com \
- --resource_prefix='required-but-does-not-matter' \
- --td_bootstrap_image='required-but-does-not-matter' --server_image='required-but-does-not-matter' --client_image='required-but-does-not-matter'
+#######################################
+# The BASIC cluster is used by url-map tests. Only cleaning the xds client
+# namespaces; the xds server namespaces are shared.
+# The retention period is the shared KEEP_HOURS value.
+# Arguments:
+#   Job name, forwarded to cleanup::run_clean.
+# Returns:
+#   Non-zero if cluster activation or the cleanup run fails.
+#######################################
+cleanup::job::cleanup_cluster_url_map() {
+  # Chained with "&&": main() runs jobs on the left side of "||", where
+  # errexit is ignored, so the cleanup must be skipped explicitly when the
+  # cluster could not be activated.
+  cleanup::activate_cluster GKE_CLUSTER_PSM_BASIC \
+    && cleanup::run_clean "$1" --mode=k8s
+}
+
+#######################################
+# The SECURITY cluster is used by the security and authz test suites.
+# Arguments:
+#   Job name, forwarded to cleanup::run_clean.
+# Returns:
+#   Non-zero if cluster activation or the cleanup run fails.
+#######################################
+cleanup::job::cleanup_cluster_security() {
+  # Chained with "&&": main() runs jobs on the left side of "||", where
+  # errexit is ignored, so the cleanup must be skipped explicitly when the
+  # cluster could not be activated.
+  cleanup::activate_cluster GKE_CLUSTER_PSM_SECURITY \
+    && cleanup::run_clean "$1" --mode=k8s
+}
+
+#######################################
+# Runs the Python cleanup module for one job and saves its output as a log.
+# Globals:
+#   TEST_DRIVER_FLAGFILE: Relative path to test driver flagfile
+#   TEST_XML_OUTPUT_DIR: Output directory for the test xUnit XML report
+#   CLEANUP_KUBE_CONTEXT: The name of kubectl context with GKE cluster access;
+#     the literal "unset" is passed when it is empty or not set.
+#   KEEP_HOURS: Value forwarded as --keep_hours.
+# Arguments:
+#   Test job name (required). Currently only used to generate asset path, and
+#   uses values from the cleanup_jobs array of main().
+#   Any further arguments are appended to the cleanup command line as-is.
+#   TODO(sergiitk): turn job_name into action test methods of the cleanup.
+# Outputs:
+#   Writes the combined stdout and stderr of the cleanup run to stdout and to
+#   ${TEST_XML_OUTPUT_DIR}/${job_name}/sponge_log.log
+#######################################
+cleanup::run_clean() {
+  local job_name="${1:?Usage: cleanup::run_clean job_name}"
+  local log_dir="${TEST_XML_OUTPUT_DIR}/${job_name}"
+  # Fixed flags first, then everything the caller passed after the job name.
+  local -a cleanup_flags=(
+    --flagfile="${TEST_DRIVER_FLAGFILE}"
+    --kube_context="${CLEANUP_KUBE_CONTEXT:-unset}"
+    --keep_hours="${KEEP_HOURS}"
+    "${@:2}"
+  )
+  mkdir -pv "${log_dir}"
+  # TODO(sergiitk): make it a test, where job_name is a separate method.
+  python3 -m bin.cleanup.cleanup "${cleanup_flags[@]}" 2>&1 \
+    | tee "${log_dir}/sponge_log.log"
+}
+
+#######################################
+# Main function: provision software necessary to execute the cleanup tasks;
+# run them, and report the status.
+#######################################
+main() {
+ local script_dir
+ script_dir="$(dirname "$0")"
+
+ # Source the test captured from the master branch.
+ echo "Sourcing test driver install captured from: ${TEST_DRIVER_INSTALL_SCRIPT_URL}"
+ source /dev/stdin <<< "$(curl -s "${TEST_DRIVER_INSTALL_SCRIPT_URL}")"
+ set +x
+
+ # Valid cluster variables needed for the automatic driver setup.
+ activate_gke_cluster GKE_CLUSTER_PSM_BASIC
+ kokoro_setup_test_driver "${GITHUB_REPOSITORY_NAME}"
+
+ # Run tests
+ cd "${TEST_DRIVER_FULL_DIR}"
+ local failed_jobs=0
+ declare -a cleanup_jobs
+ cleanup_jobs=(
+ "cleanup_td"
+ "cleanup_cluster_lb_primary"
+ "cleanup_cluster_lb_secondary"
+ "cleanup_cluster_security"
+ "cleanup_cluster_url_map"
+ )
+ for job_name in "${cleanup_jobs[@]}"; do
+ echo "-------------------- Starting job ${job_name} --------------------"
+ set -x
+ "cleanup::job::${job_name}" "${job_name}" || (( ++failed_jobs ))
+ set +x
+ echo "-------------------- Finished job ${job_name} --------------------"
+ done
+ echo "Failed job suites: ${failed_jobs}"
+ if (( failed_jobs > 0 )); then
+ exit 1
+ fi
+}
+
+main "$@"