diff options
Diffstat (limited to 'tools/check_language.py')
-rwxr-xr-x | tools/check_language.py | 179 |
1 files changed, 179 insertions, 0 deletions
#!/usr/bin/env python3

# Copyright 2020 The Amber Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Script to check files for inclusive language. The script will scan all files
and flag non-inclusive terminology which is identified.

Usage: run the script from a folder and the script will scan down through that
folder. The process exits with status 1 if any file contains a flagged term
and 0 otherwise.
"""

import fnmatch
import os
import re
import sys

# Patterns are searched against the *entire* contents of a file (not line by
# line), so `^` / `$` only anchor at the start / end of the file; occurrences
# at line boundaries inside the file are covered by the `\s` variants.
# NOTE: inside a character class `|` is a literal pipe, so `['|\u2019]` also
# accepts "he|d" etc.; kept as-is to preserve the existing matching behavior.
REGEXES = [
    r"(?i)black[-_]?list",
    r"(?i)white[-_]?list",
    r"(?i)gr[ea]y[-_]?list",
    r"(?i)(first class citizen)",
    r"(?i)black[-_]?hat",
    r"(?i)white[-_]?hat",
    r"(?i)gr[ea]y[-_]?hat",
    r"(?i)master",
    r"(?i)slave",
    r"(?i)\bhim\b",
    r"(?i)\bhis\b",
    r"(?i)\bshe\b",
    r"(?i)\bher\b",
    r"(?i)\bhers\b",
    r"(?i)\bman\b",
    r"(?i)\bwoman\b",
    r"(?i)\she\s",
    r"(?i)\she$",
    r"(?i)^he\s",
    r"(?i)^he$",
    r"(?i)\she['|\u2019]d\s",
    r"(?i)\she['|\u2019]d$",
    r"(?i)^he['|\u2019]d\s",
    r"(?i)^he['|\u2019]d$",
    r"(?i)\she['|\u2019]s\s",
    r"(?i)\she['|\u2019]s$",
    r"(?i)^he['|\u2019]s\s",
    r"(?i)^he['|\u2019]s$",
    r"(?i)\she['|\u2019]ll\s",
    r"(?i)\she['|\u2019]ll$",
    r"(?i)^he['|\u2019]ll\s",
    r"(?i)^he['|\u2019]ll$",
    r"(?i)grandfather",
    r"(?i)\bmitm\b",
    r"(?i)\bcrazy\b",
    r"(?i)\binsane\b",
    r"(?i)\bblind\sto\b",
    r"(?i)\bflying\sblind\b",
    r"(?i)\bblind\seye\b",
    r"(?i)\bcripple\b",
    r"(?i)\bcrippled\b",
    r"(?i)\bdumb\b",
    r"(?i)\bdummy\b",
    r"(?i)\bparanoid\b",
    r"(?i)\bsane\b",
    r"(?i)\bsanity\b",
    r"(?i)red[-_]?line",
]

# Patterns that, when they match at (or just before) a flagged term, mark the
# hit as acceptable.
SUPPRESSIONS = [
    r"(?i)MS_SLAVE",
    # The hyphen must be last in the class: `[ -_]` would be the character
    # *range* from space to underscore and suppress far more than intended.
    r"(?i)man[ _-]?page",
]

# Compile once at import time; every file is checked against the same lists.
REGEX_LIST = [re.compile(reg) for reg in REGEXES]
SUPPRESSION_LIST = [re.compile(supp) for supp in SUPPRESSIONS]

# Number of characters before a hit at which a suppression may also start.
# This exists for the "MS_" prefix of MS_SLAVE: the flagged term ("slave")
# starts three characters into the suppressed identifier.
_SUPPRESSION_PREFIX_LEN = 3


def find(top, filename_glob, skip_glob_list):
    """Returns files in the tree rooted at top matching filename_glob but not
    in directories matching skip_glob_list.

    Files whose base name equals this script's own base name are skipped,
    since the script necessarily contains every flagged term.
    """
    self_name = os.path.basename(__file__)
    file_list = []
    for path, dirs, files in os.walk(top):
        # Prune in place so os.walk does not descend into skipped directories.
        dirs[:] = [
            d for d in dirs
            if not any(fnmatch.fnmatch(d, glob) for glob in skip_glob_list)
        ]
        for filename in fnmatch.filter(files, filename_glob):
            if filename == self_name:
                continue
            file_list.append(os.path.join(path, filename))
    return file_list


def filtered_descendants(glob):
    """Returns glob-matching filenames under the current directory, but skips
    some irrelevant paths."""
    return find('.', glob, ['third_party', 'external', 'build*', 'out*',
                            'CompilerIdCXX', '.git'])


def _is_suppressed(contents, idx):
    """Returns True if a suppression pattern covers the hit starting at idx."""
    for supp in SUPPRESSION_LIST:
        # Pattern.match(string, pos) anchors at pos without copying the
        # remainder of the file for every check.
        if supp.match(contents, idx):
            return True

        # This is a hack to handle the MS_ prefix that is needed
        # to check for. Find a better way if we get more suppressions
        # which modify the prefix of the string
        if (idx >= _SUPPRESSION_PREFIX_LEN
                and supp.match(contents, idx - _SUPPRESSION_PREFIX_LEN)):
            return True
    return False


def check_match(filename, contents):
    """Check if contents contains any matching entries.

    Every occurrence of every pattern in REGEX_LIST is examined, so a
    suppressed first occurrence does not hide later unsuppressed ones. Each
    unsuppressed occurrence is printed, prefixed with filename.

    Returns True if at least one unsuppressed occurrence was found.
    """
    ret = False
    for reg in REGEX_LIST:
        for match in reg.finditer(contents):
            if _is_suppressed(contents, match.start()):
                continue
            # No matching suppression.
            print("{}: found non-inclusive language: {}".format(
                filename, match.group(0)))
            ret = True
    return ret


def alert_if_lang_matches(glob, verbose=False):
    """Prints names of all files matching non-inclusive language.

    Finds all glob-matching files under the current directory and checks if
    they contain the language pattern. Prints the names of all the files that
    match.

    Files that cannot be opened or are not valid UTF-8 text (e.g. binaries)
    are skipped; pass verbose=True to print a line for each skipped file.

    Returns the total number of files in which a match was found.
    """
    printed_count = 0
    for file in filtered_descendants(glob):
        try:
            with open(file, 'r', encoding='utf8') as contents:
                text = contents.read()
        except (OSError, UnicodeDecodeError):
            # Best effort: unreadable and non-text files are not checked.
            if verbose:
                print("skipping {}".format(file))
            continue

        if check_match(file, text):
            printed_count += 1

    return printed_count


def main():
    """Scans the current directory tree and exits non-zero on any match."""
    globs = ['*']
    count = 0
    for glob in globs:
        count += alert_if_lang_matches(glob)

    sys.exit(1 if count > 0 else 0)


if __name__ == '__main__':
    main()