1 files changed, 179 insertions, 0 deletions
diff --git a/tools/check_language.py b/tools/check_language.py
new file mode 100755
index 0000000..b7ca528
--- /dev/null
+++ b/tools/check_language.py
@@ -0,0 +1,179 @@
+#!/usr/bin/env python
+
+# Copyright 2020 The Amber Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#	http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Script to check files for inclusive language. The script will scan all files
+and flag non-inclusive terminology which is identified.
+
+Usage, run the script from a folder and the script will scan down through that
+folder.
+"""
+
+import fnmatch
+import os
+import re
+import sys
+
+REGEXES = [
+	r"(?i)black[-_]?list",
+	r"(?i)white[-_]?list",
+	r"(?i)gr[ea]y[-_]?list",
+	r"(?i)(first class citizen)",
+	r"(?i)black[-_]?hat",
+	r"(?i)white[-_]?hat",
+	r"(?i)gr[ea]y[-_]?hat",
+	r"(?i)master",
+	r"(?i)slave",
+	r"(?i)\bhim\b",
+	r"(?i)\bhis\b",
+	r"(?i)\bshe\b",
+	r"(?i)\bher\b",
+	r"(?i)\bhers\b",
+	r"(?i)\bman\b",
+	r"(?i)\bwoman\b",
+	r"(?i)\she\s",
+	r"(?i)\she$",
+	r"(?i)^he\s",
+	r"(?i)^he$",
+	r"(?i)\she['|\u2019]d\s",
+	r"(?i)\she['|\u2019]d$",
+	r"(?i)^he['|\u2019]d\s",
+	r"(?i)^he['|\u2019]d$",
+	r"(?i)\she['|\u2019]s\s",
+	r"(?i)\she['|\u2019]s$",
+	r"(?i)^he['|\u2019]s\s",
+	r"(?i)^he['|\u2019]s$",
+	r"(?i)\she['|\u2019]ll\s",
+	r"(?i)\she['|\u2019]ll$",
+	r"(?i)^he['|\u2019]ll\s",
+	r"(?i)^he['|\u2019]ll$",
+	r"(?i)grandfather",
+	r"(?i)\bmitm\b",
+	r"(?i)\bcrazy\b",
+	r"(?i)\binsane\b",
+	r"(?i)\bblind\sto\b",
+	r"(?i)\bflying\sblind\b",
+	r"(?i)\bblind\seye\b",
+	r"(?i)\bcripple\b",
+	r"(?i)\bcrippled\b",
+	r"(?i)\bdumb\b",
+	r"(?i)\bdummy\b",
+	r"(?i)\bparanoid\b",
+	r"(?i)\bsane\b",
+	r"(?i)\bsanity\b",
+	r"(?i)red[-_]?line",
+]
+
+SUPPRESSIONS = [
+	r"(?i)MS_SLAVE",
+	r"(?i)man[ -_]?page",
+]
+
+
+REGEX_LIST = []
+for reg in REGEXES:
+	REGEX_LIST.append(re.compile(reg))
+
+SUPPRESSION_LIST = []
+for supp in SUPPRESSIONS:
+	SUPPRESSION_LIST.append(re.compile(supp))
+
+def find(top, filename_glob, skip_glob_list):
+	"""Returns files in the tree rooted at top matching filename_glob but not
+	in directories matching skip_glob_list."""
+
+	file_list = []
+	for path, dirs, files in os.walk(top):
+		for glob in skip_glob_list:
+			for match in fnmatch.filter(dirs, glob):
+				dirs.remove(match)
+		for filename in fnmatch.filter(files, filename_glob):
+			if filename == os.path.basename(__file__):
+				continue
+			file_list.append(os.path.join(path, filename))
+	return file_list
+
+
+def filtered_descendants(glob):
+	"""Returns glob-matching filenames under the current directory, but skips
+	some irrelevant paths."""
+	return find('.', glob, ['third_party', 'external', 'build*', 'out*',
+							'CompilerIdCXX', '.git'])
+
+def check_match(filename, contents):
+	"""Check if contents contains any matching entries"""
+	ret = False
+	for reg in REGEX_LIST:
+		match = reg.search(contents)
+		if match:
+			suppressed = False
+			for supp in SUPPRESSION_LIST:
+				idx = match.start()
+				supp_match = supp.match(contents[idx:])
+				if supp_match:
+					suppressed = True
+
+				# This is a hack to handle the MS_ prefix that is needed
+				# to check for. Find a better way if we get more suppressions
+				# which modify the prefix of the string
+				if idx >= 3:
+					supp_match = supp.match(contents[idx - 3:])
+					if supp_match:
+						suppressed = True
+
+			if not suppressed:
+				# No matching suppression.
+				print("{}: found non-inclusive language: {}".format(
+						filename, match.group(0)))
+				ret = True
+
+	return ret
+
+
+def alert_if_lang_matches(glob):
+	"""Prints names of all files matching non-inclusive language.
+
+	Finds all glob-matching files under the current directory and checks if they
+	contain the language pattern.  Prints the names of all the files that
+	match.
+
+	Returns the total number of file names printed.
+	"""
+	verbose = False
+	printed_count = 0
+	for file in filtered_descendants(glob):
+		has_match = False
+		try:
+			with open(file, 'r', encoding='utf8') as contents:
+				if check_match(file, contents.read()):
+					printed_count += 1
+		except:
+			if verbose:
+				print("skipping {}".format(file))
+
+	return printed_count
+
+
+def main():
+	globs = ['*']
+	count = 0
+	for glob in globs:
+		count += alert_if_lang_matches(glob)
+
+	sys.exit(count > 0)
+
+if __name__ == '__main__':
+	main()