diff options
author | dan sinclair <dsinclair@google.com> | 2020-08-13 09:48:28 -0400 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-08-13 09:48:28 -0400 |
commit | 88f78401e9af26f1249944b942ddf5dd706572e8 (patch) | |
tree | 8c1346ef55a3ef92aa57cba44165bcd014357109 | |
parent | e5717280728970d6317ed896d1ae14acf123ebfc (diff) | |
download | amber-88f78401e9af26f1249944b942ddf5dd706572e8.tar.gz |
Simple script to check for inclusive language. (#897)
This CL adds a simple script to check for inclusive language. All files
in the repo are scanned and any matches are output.
-rw-r--r-- | .gitignore | 2 | ||||
-rwxr-xr-x | tools/check_language.py | 179 | ||||
-rwxr-xr-x | tools/check_language_test.py | 61 |
3 files changed, 242 insertions, 0 deletions
@@ -19,6 +19,8 @@ third_party/vulkan-loader third_party/vulkan-validationlayers/ .vs +*.pyc + # Vim swap files [._]*.s[a-w][a-z]
diff --git a/tools/check_language.py b/tools/check_language.py
new file mode 100755
index 0000000..b7ca528
--- /dev/null
+++ b/tools/check_language.py
@@ -0,0 +1,179 @@
+#!/usr/bin/env python
+
+# Copyright 2020 The Amber Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Script to check files for inclusive language. The script will scan all files
+and flag non-inclusive terminology which is identified.
+
+Usage, run the script from a folder and the script will scan down through that
+folder.
+"""
+
+import fnmatch
+import os
+import re
+import sys
+
+REGEXES = [
+    r"(?i)black[-_]?list",
+    r"(?i)white[-_]?list",
+    r"(?i)gr[ea]y[-_]?list",
+    r"(?i)(first class citizen)",
+    r"(?i)black[-_]?hat",
+    r"(?i)white[-_]?hat",
+    r"(?i)gr[ea]y[-_]?hat",
+    r"(?i)master",
+    r"(?i)slave",
+    r"(?i)\bhim\b",
+    r"(?i)\bhis\b",
+    r"(?i)\bshe\b",
+    r"(?i)\bher\b",
+    r"(?i)\bhers\b",
+    r"(?i)\bman\b",
+    r"(?i)\bwoman\b",
+    r"(?i)\she\s",
+    r"(?i)\she$",
+    r"(?i)^he\s",
+    r"(?i)^he$",
+    r"(?i)\she['|\u2019]d\s",
+    r"(?i)\she['|\u2019]d$",
+    r"(?i)^he['|\u2019]d\s",
+    r"(?i)^he['|\u2019]d$",
+    r"(?i)\she['|\u2019]s\s",
+    r"(?i)\she['|\u2019]s$",
+    r"(?i)^he['|\u2019]s\s",
+    r"(?i)^he['|\u2019]s$",
+    r"(?i)\she['|\u2019]ll\s",
+    r"(?i)\she['|\u2019]ll$",
+    r"(?i)^he['|\u2019]ll\s",
+    r"(?i)^he['|\u2019]ll$",
+    r"(?i)grandfather",
+    r"(?i)\bmitm\b",
+    r"(?i)\bcrazy\b",
+    r"(?i)\binsane\b",
+    r"(?i)\bblind\sto\b",
+    r"(?i)\bflying\sblind\b",
+    r"(?i)\bblind\seye\b",
+    r"(?i)\bcripple\b",
+    r"(?i)\bcrippled\b",
+    r"(?i)\bdumb\b",
+    r"(?i)\bdummy\b",
+    r"(?i)\bparanoid\b",
+    r"(?i)\bsane\b",
+    r"(?i)\bsanity\b",
+    r"(?i)red[-_]?line",
+]
+
+SUPPRESSIONS = [
+    r"(?i)MS_SLAVE",
+    r"(?i)man[ -_]?page",
+]
+
+
+REGEX_LIST = []
+for reg in REGEXES:
+    REGEX_LIST.append(re.compile(reg))
+
+SUPPRESSION_LIST = []
+for supp in SUPPRESSIONS:
+    SUPPRESSION_LIST.append(re.compile(supp))
+
+def find(top, filename_glob, skip_glob_list):
+    """Returns files in the tree rooted at top matching filename_glob but not
+    in directories matching skip_glob_list."""
+
+    file_list = []
+    for path, dirs, files in os.walk(top):
+        for glob in skip_glob_list:
+            for match in fnmatch.filter(dirs, glob):
+                dirs.remove(match)
+        for filename in fnmatch.filter(files, filename_glob):
+            if filename == os.path.basename(__file__):
+                continue
+            file_list.append(os.path.join(path, filename))
+    return file_list
+
+
+def filtered_descendants(glob):
+    """Returns glob-matching filenames under the current directory, but skips
+    some irrelevant paths."""
+    return find('.', glob, ['third_party', 'external', 'build*', 'out*',
+                            'CompilerIdCXX', '.git'])
+
+def check_match(filename, contents):
+    """Check if contents contains any matching entries"""
+    ret = False
+    for reg in REGEX_LIST:
+        match = reg.search(contents)
+        if match:
+            suppressed = False
+            for supp in SUPPRESSION_LIST:
+                idx = match.start()
+                supp_match = supp.match(contents[idx:])
+                if supp_match:
+                    suppressed = True
+
+                # This is a hack to handle the MS_ prefix that is needed
+                # to check for. Find a better way if we get more suppressions
+                # which modify the prefix of the string
+                if idx >= 3:
+                    supp_match = supp.match(contents[idx - 3:])
+                    if supp_match:
+                        suppressed = True
+
+            if not suppressed:
+                # No matching suppression.
+                print("{}: found non-inclusive language: {}".format(
+                    filename, match.group(0)))
+                ret = True
+
+    return ret
+
+
+def alert_if_lang_matches(glob):
+    """Prints names of all files matching non-inclusive language.
+
+    Finds all glob-matching files under the current directory and checks if they
+    contain the language pattern. Prints the names of all the files that
+    match.
+
+    Returns the total number of file names printed.
+    """
+    verbose = False
+    printed_count = 0
+    for file in filtered_descendants(glob):
+        has_match = False
+        try:
+            with open(file, 'r', encoding='utf8') as contents:
+                if check_match(file, contents.read()):
+                    printed_count += 1
+        except:
+            if verbose:
+                print("skipping {}".format(file))
+
+    return printed_count
+
+
+def main():
+    globs = ['*']
+    count = 0
+    for glob in globs:
+        count += alert_if_lang_matches(glob)
+
+    sys.exit(count > 0)
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/check_language_test.py b/tools/check_language_test.py
new file mode 100755
index 0000000..8f20791
--- /dev/null
+++ b/tools/check_language_test.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+
+# Copyright 2020 The Amber Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unit tests for check_language.py."""
+
+import os
+import sys
+import unittest
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import check_language
+
+class TestCheckLanguage(unittest.TestCase):
+    def testMatches(self):
+        tests = ["blacklist", "black-list", "black_list", "whitelist",
+                 "white-list", "white_list", "greylist", "grey-list", "grey_list",
+                 "graylist", "gray-list", "gray_list", "first class citizen",
+                 "blackhat", "black-hat", "black_hat", "whitehat", "white-hat",
+                 "white_hat", "greyhat", "grey-hat", "grey_hat", "grayhat",
+                 "gray-hat", "gray_hat", "master", "slave", "him", "his", "she",
+                 "her", "hers", "man", "woman", "he", "he'd", "he's", "he'll",
+                 "he\u2019d", "he\u2019s", "he\u2019ll",
+                 "grandfather", "mitm", "crazy", "insane", "blind to",
+                 "flying blind", "blind eye", "cripple", "crippled", "dumb",
+                 "dummy", "paranoid", "sane", "sanity", "redline", "red-line",
+                 "red_line"]
+
+        for word in tests:
+            self.assertTrue(
+                check_language.check_match("", "this is a " + word + " attempt"), word)
+
+
+    def testSuppression(self):
+        self.assertFalse(check_language.check_match("", "in the man-pages"))
+        self.assertFalse(check_language.check_match("", "the MS_SLAVE test"))
+
+
+    def testMatchStartofFileWhenRequireSpace(self):
+        self.assertTrue(check_language.check_match("", "he said"))
+
+
+    def testMatchOverNewline(self):
+        self.assertTrue(check_language.check_match("", "flying\nblind"))
+
+
+if __name__ == '__main__':
+    unittest.main()