aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author dan sinclair <dsinclair@google.com> 2020-08-13 09:48:28 -0400
committer GitHub <noreply@github.com> 2020-08-13 09:48:28 -0400
commit 88f78401e9af26f1249944b942ddf5dd706572e8 (patch)
tree 8c1346ef55a3ef92aa57cba44165bcd014357109
parent e5717280728970d6317ed896d1ae14acf123ebfc (diff)
download amber-88f78401e9af26f1249944b942ddf5dd706572e8.tar.gz
Simple script to check for inclusive language. (#897)
This CL adds a simple script to check for inclusive language. All files in the repo are scanned and any matches are output.
-rw-r--r-- .gitignore 2
-rwxr-xr-x tools/check_language.py 179
-rwxr-xr-x tools/check_language_test.py 61
3 files changed, 242 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
index 55f5cce..d5f3eb0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,6 +19,8 @@ third_party/vulkan-loader
third_party/vulkan-validationlayers/
.vs
+*.pyc
+
# Vim swap files
[._]*.s[a-w][a-z]
diff --git a/tools/check_language.py b/tools/check_language.py
new file mode 100755
index 0000000..b7ca528
--- /dev/null
+++ b/tools/check_language.py
@@ -0,0 +1,179 @@
+#!/usr/bin/env python
+
+# Copyright 2020 The Amber Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Script to check files for inclusive language. The script will scan all files
+and flag non-inclusive terminology which is identified.
+
+Usage, run the script from a folder and the script will scan down through that
+folder.
+"""
+
+import fnmatch
+import os
+import re
+import sys
+
# Patterns that flag non-inclusive terminology. Each one carries the inline
# (?i) flag, so matching is case-insensitive.
#
# The "he" family is spelled out with explicit \s / ^ / $ boundaries instead
# of \b. The apostrophe class accepts both the ASCII apostrophe and the
# typographic one (U+2019) and nothing else.
REGEXES = [
    r"(?i)black[-_]?list",
    r"(?i)white[-_]?list",
    r"(?i)gr[ea]y[-_]?list",
    r"(?i)(first class citizen)",
    r"(?i)black[-_]?hat",
    r"(?i)white[-_]?hat",
    r"(?i)gr[ea]y[-_]?hat",
    r"(?i)master",
    r"(?i)slave",
    r"(?i)\bhim\b",
    r"(?i)\bhis\b",
    r"(?i)\bshe\b",
    r"(?i)\bher\b",
    r"(?i)\bhers\b",
    r"(?i)\bman\b",
    r"(?i)\bwoman\b",
    r"(?i)\she\s",
    r"(?i)\she$",
    r"(?i)^he\s",
    r"(?i)^he$",
    r"(?i)\she['\u2019]d\s",
    r"(?i)\she['\u2019]d$",
    r"(?i)^he['\u2019]d\s",
    r"(?i)^he['\u2019]d$",
    r"(?i)\she['\u2019]s\s",
    r"(?i)\she['\u2019]s$",
    r"(?i)^he['\u2019]s\s",
    r"(?i)^he['\u2019]s$",
    r"(?i)\she['\u2019]ll\s",
    r"(?i)\she['\u2019]ll$",
    r"(?i)^he['\u2019]ll\s",
    r"(?i)^he['\u2019]ll$",
    r"(?i)grandfather",
    r"(?i)\bmitm\b",
    r"(?i)\bcrazy\b",
    r"(?i)\binsane\b",
    r"(?i)\bblind\sto\b",
    r"(?i)\bflying\sblind\b",
    r"(?i)\bblind\seye\b",
    r"(?i)\bcripple\b",
    r"(?i)\bcrippled\b",
    r"(?i)\bdumb\b",
    r"(?i)\bdummy\b",
    r"(?i)\bparanoid\b",
    r"(?i)\bsane\b",
    r"(?i)\bsanity\b",
    r"(?i)red[-_]?line",
]

# Patterns that cancel a hit when they match at (or just before) the position
# of the flagged term.
#
# In the man-page pattern the hyphen is placed last in the character class so
# it is a literal '-'. Written as "[ -_]" it would be a range from space to
# underscore and would also accept digits, punctuation and letters.
SUPPRESSIONS = [
    r"(?i)MS_SLAVE",
    r"(?i)man[ _-]?page",
]


# Compile once at import time; check_match reuses these for every file.
REGEX_LIST = [re.compile(reg) for reg in REGEXES]

SUPPRESSION_LIST = [re.compile(supp) for supp in SUPPRESSIONS]
+
def find(top, filename_glob, skip_glob_list):
    """Collect paths of files under top whose names match filename_glob.

    Directories whose names match any glob in skip_glob_list are pruned from
    the walk, so nothing beneath them is visited. Files that share this
    script's own file name are left out of the result.

    Args:
      top: directory at which the walk starts.
      filename_glob: fnmatch-style pattern applied to each file name.
      skip_glob_list: fnmatch-style patterns applied to directory names.

    Returns:
      A list of matching paths, each joined onto the directory it was found in.
    """
    own_name = os.path.basename(__file__)
    collected = []
    for root, subdirs, filenames in os.walk(top):
        # Removing entries from the list os.walk handed us stops it from
        # descending into those directories.
        for pattern in skip_glob_list:
            for skipped in fnmatch.filter(subdirs, pattern):
                subdirs.remove(skipped)
        collected.extend(
            os.path.join(root, name)
            for name in fnmatch.filter(filenames, filename_glob)
            if name != own_name)
    return collected
+
+
def filtered_descendants(glob):
    """List files below the current directory whose names match glob.

    Vendored, build-output and version-control directories are excluded from
    the walk.
    """
    skipped_dirs = ['third_party', 'external', 'build*', 'out*',
                    'CompilerIdCXX', '.git']
    return find('.', glob, skipped_dirs)
+
def _is_suppressed(contents, idx):
    """Return True if a suppression pattern covers the hit starting at idx."""
    for supp in SUPPRESSION_LIST:
        # Pattern.match(string, pos) anchors at pos without copying the tail
        # of the file the way slicing would.
        if supp.match(contents, idx):
            return True

        # This is a hack to handle the MS_ prefix: the flagged term starts
        # three characters after the suppression does. Find a better way if
        # more suppressions appear that modify the prefix of the string.
        if idx >= 3 and supp.match(contents, idx - 3):
            return True
    return False


def check_match(filename, contents):
    """Report non-inclusive terms found in contents.

    For each pattern in REGEX_LIST the occurrences are examined in order and
    the first one not covered by a suppression is printed. Looking past the
    first occurrence matters: a suppressed hit (e.g. MS_SLAVE) must not hide a
    later, genuine hit of the same pattern.

    Args:
      filename: name printed as the prefix of each report line.
      contents: the full text to scan.

    Returns:
      True if at least one unsuppressed match was reported, otherwise False.
    """
    found = False
    for reg in REGEX_LIST:
        for match in reg.finditer(contents):
            if _is_suppressed(contents, match.start()):
                continue
            print("{}: found non-inclusive language: {}".format(
                filename, match.group(0)))
            found = True
            # One report per pattern keeps the output volume unchanged.
            break
    return found
+
+
def alert_if_lang_matches(glob, verbose=False):
    """Scan glob-matching files for non-inclusive language.

    Finds all glob-matching files under the current directory and runs
    check_match on the contents of each, which prints the offending terms.

    Args:
      glob: fnmatch-style pattern selecting which file names to scan.
      verbose: when True, print the name of every file that is skipped
        because it could not be read as UTF-8 text.

    Returns:
      The number of files in which check_match reported a match.
    """
    printed_count = 0
    for path in filtered_descendants(glob):
        try:
            with open(path, 'r', encoding='utf8') as handle:
                contents = handle.read()
        except (OSError, UnicodeDecodeError):
            # Unreadable files and files that are not valid UTF-8 are skipped
            # on purpose. Only these errors are caught, so Ctrl-C and bugs in
            # the checker itself are no longer silently swallowed.
            if verbose:
                print("skipping {}".format(path))
            continue

        if check_match(path, contents):
            printed_count += 1

    return printed_count
+
+
def main():
    """Scan every file below the current directory and exit non-zero on hits."""
    globs = ['*']
    count = sum(alert_if_lang_matches(glob) for glob in globs)

    # A true exit status signals that at least one file contained a match.
    sys.exit(count > 0)


if __name__ == '__main__':
    main()
diff --git a/tools/check_language_test.py b/tools/check_language_test.py
new file mode 100755
index 0000000..8f20791
--- /dev/null
+++ b/tools/check_language_test.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+
+# Copyright 2020 The Amber Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unit tests for check_language.py."""
+
+import os
+import sys
+import unittest
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import check_language
+
class TestCheckLanguage(unittest.TestCase):
    """Unit tests for check_language.check_match."""

    def testMatches(self):
        """Each flagged term is detected when embedded in a sentence."""
        flagged_terms = (
            "blacklist", "black-list", "black_list",
            "whitelist", "white-list", "white_list",
            "greylist", "grey-list", "grey_list",
            "graylist", "gray-list", "gray_list",
            "first class citizen",
            "blackhat", "black-hat", "black_hat",
            "whitehat", "white-hat", "white_hat",
            "greyhat", "grey-hat", "grey_hat",
            "grayhat", "gray-hat", "gray_hat",
            "master", "slave",
            "him", "his", "she", "her", "hers", "man", "woman",
            "he", "he'd", "he's", "he'll",
            "he\u2019d", "he\u2019s", "he\u2019ll",
            "grandfather", "mitm", "crazy", "insane",
            "blind to", "flying blind", "blind eye",
            "cripple", "crippled", "dumb", "dummy", "paranoid",
            "sane", "sanity",
            "redline", "red-line", "red_line",
        )

        for term in flagged_terms:
            sentence = "this is a " + term + " attempt"
            # The term doubles as the failure message so the culprit is named.
            self.assertTrue(check_language.check_match("", sentence), term)

    def testSuppression(self):
        """Suppressed spellings are not reported."""
        for text in ("in the man-pages", "the MS_SLAVE test"):
            self.assertFalse(check_language.check_match("", text))

    def testMatchStartofFileWhenRequireSpace(self):
        """A whitespace-delimited term is still found at the very start."""
        found = check_language.check_match("", "he said")
        self.assertTrue(found)

    def testMatchOverNewline(self):
        """A newline between the words of a phrase counts as whitespace."""
        found = check_language.check_match("", "flying\nblind")
        self.assertTrue(found)


if __name__ == '__main__':
    unittest.main()