diff --git a/tools/buildgen/extract_metadata_from_bazel_xml.py b/tools/buildgen/extract_metadata_from_bazel_xml.py
index 228a8e29437..c5fc9305643 100755
--- a/tools/buildgen/extract_metadata_from_bazel_xml.py
+++ b/tools/buildgen/extract_metadata_from_bazel_xml.py
@@ -13,6 +13,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# Script to extract build metadata from bazel BUILD files.
+# To avoid having two sources of truth for the build metadata (build
+# targets, source files, header files etc.), this script analyzes the
+# contents of bazel BUILD files and generates a YAML file (currently called
+# build_autogenerated.yaml). The format and semantics of the generated YAML
+# files are chosen to match the format of a "build.yaml" file, which used
+# to be the source of truth for the gRPC build before bazel became
+# the primary build system.
+# A good basic overview of the "build.yaml" format is available here:
+# https://github.com/grpc/grpc/blob/master/templates/README.md. Note that
+# while useful as an overview, the doc does not act as a formal spec
+# (in fact, no formal spec exists) and it can be incomplete, inaccurate
+# or slightly out of date.
+# TODO(jtattermusch): In the future we want to get rid of the legacy
+# build.yaml format entirely, or simplify it to the point where it becomes
+# self-explanatory and doesn't need any detailed documentation.
+
 import subprocess
 import yaml
 import xml.etree.ElementTree as ET
@@ -32,6 +49,7 @@ def _bazel_query_xml_tree(query):
 
 
 def _rule_dict_from_xml_node(rule_xml_node):
+    """Converts an XML node representing a rule (obtained from "bazel query --output xml") to a dictionary with all the metadata we will need."""
     result = {
         'class': rule_xml_node.attrib.get('class'),
         'name': rule_xml_node.attrib.get('name'),
@@ -63,6 +81,7 @@ def _rule_dict_from_xml_node(rule_xml_node):
 
 
 def _extract_rules_from_bazel_xml(xml_tree):
+    """Extracts bazel rules from an XML tree obtained from the "bazel query --output xml" command."""
     result = {}
     for child in xml_tree:
         if child.tag == 'rule':
@@ -133,8 +152,13 @@ def _extract_deps(bazel_rule):
 
 
 def _create_target_from_bazel_rule(target_name, bazel_rules):
-    # extract the deps from bazel
+    """Creates a build.yaml-like target definition from bazel metadata."""
     bazel_rule = bazel_rules[_get_bazel_label(target_name)]
+
+    # Create a template for our target from the bazel rule. Initially we only
+    # populate some "private" fields with the original info we got from bazel,
+    # and only later populate the public fields (once we do some extra
+    # postprocessing).
     result = {
         'name': target_name,
         '_PUBLIC_HEADERS_BAZEL': _extract_public_headers(bazel_rule),
@@ -312,22 +336,37 @@ def _expand_intermediate_deps(target_dict, public_dep_names, bazel_rules):
 
 
 def _generate_build_metadata(build_extra_metadata, bazel_rules):
+    """Generates build metadata in a build.yaml-like format from bazel build metadata and the build.yaml-specific "extra metadata"."""
     lib_names = build_extra_metadata.keys()
     result = {}
 
     for lib_name in lib_names:
         lib_dict = _create_target_from_bazel_rule(lib_name, bazel_rules)
 
+        # Figure out the final list of headers and sources for the given target.
+        # While this is mostly based on bazel build metadata, build.yaml does not
+        # necessarily expose all the targets that are present in the bazel build;
+        # these "intermediate dependencies" might get flattened.
+        # TODO(jtattermusch): This is done to avoid introducing too many intermediate
+        # libraries into the build.yaml-based builds (which might in turn cause issues
+        # when building language-specific artifacts) and also because the libraries
+        # in build.yaml-based builds are generally considered units of distribution
+        # (= public libraries that are visible to the user and are installable),
+        # while in bazel builds it is customary to define a larger number of smaller
+        # "sublibraries". The need for elision (and expansion) of intermediate
+        # libraries can be re-evaluated in the future.
        _expand_intermediate_deps(lib_dict, lib_names, bazel_rules)
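To make the elision concrete, here is a minimal runnable sketch of the core idea (simplified data shapes and a hypothetical helper name; the real _expand_intermediate_deps above also walks transitive dependencies and handles more edge cases):

def _inline_intermediate_deps(target, rules_by_name, public_names):
    # Fold sources/headers of every non-public ("intermediate") dependency
    # into the target itself, keeping only public libraries in 'deps'.
    for dep_name in list(target['deps']):
        if dep_name not in public_names:
            dep = rules_by_name[dep_name]
            target['srcs'] = sorted(set(target['srcs']) | set(dep['srcs']))
            target['hdrs'] = sorted(set(target['hdrs']) | set(dep['hdrs']))
            target['deps'].remove(dep_name)

grpc_target = {'deps': ['gpr', 'grpc_base'], 'srcs': ['init.cc'], 'hdrs': []}
rules = {'grpc_base': {'srcs': ['channel.cc'], 'hdrs': ['channel.h']}}
_inline_intermediate_deps(grpc_target, rules, public_names={'gpr'})
# grpc_target now owns channel.cc/channel.h and depends only on 'gpr'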
 
-        # populate extra properties from build metadata
+        # populate extra properties from the build.yaml-specific "extra metadata"
         lib_dict.update(build_extra_metadata.get(lib_name, {}))
 
         # store to results
         result[lib_name] = lib_dict
 
-    # rename some targets to something else
-    # this needs to be made after we're done with most of processing logic
+    # Rename targets marked with the "_RENAME" extra metadata.
+    # This is mostly a cosmetic change to ensure that we end up with the build.yaml
+    # target names we're used to from the past (and also to avoid overly long names).
+    # The rename step needs to happen after we're done with most of the processing logic,
     # otherwise the already-renamed libraries will have different names than expected
     for lib_name in lib_names:
         to_name = build_extra_metadata.get(lib_name, {}).get('_RENAME', None)
@@ -410,8 +449,8 @@ def _extract_cc_tests(bazel_rules):
     return list(sorted(result))
 
 
-def _filter_cc_tests(tests):
-    """Filters out tests that we don't want or we cannot build them reasonably"""
+def _exclude_unwanted_cc_tests(tests):
+    """Filters out bazel tests that we don't want to run under other build systems or that we cannot reasonably build with them."""
 
     # most qps tests are autogenerated, we are fine without them
     tests = list(
@@ -478,6 +517,7 @@ def _filter_cc_tests(tests):
 
 
 def _generate_build_extra_metadata_for_tests(tests, bazel_rules):
+    """For the given tests, generates the "extra metadata" we need for our "build.yaml"-like output. The extra metadata is derived from the bazel rule metadata using a bunch of heuristics."""
     test_metadata = {}
     for test in tests:
         test_dict = {'build': 'test', '_TYPE': 'target'}
@@ -573,6 +613,16 @@ def _generate_build_extra_metadata_for_tests(tests, bazel_rules):
     return test_metadata
 
 
+def _detect_and_print_issues(build_yaml_like):
+    """Tries to detect some unusual situations and warns about them."""
+    for tgt in build_yaml_like['targets']:
+        if tgt['build'] == 'test':
+            for src in tgt['src']:
+                if src.startswith('src/') and not src.endswith('.proto'):
+                    print('source file from under "src/" tree used in test ' +
+                          tgt['name'] + ': ' + src)
+
+
 # extra metadata that will be used to construct build.yaml
 # there are mostly extra properties that we weren't able to obtain from the bazel build
 # _TYPE: whether this is library, target or test
@@ -943,31 +993,138 @@ _BAZEL_DEPS_QUERIES = [
     'deps("//src/proto/...")',
 ]
 
+# Step 1: Run a bunch of "bazel query --output xml" queries to collect
+# the raw build metadata from the bazel build.
+# At the end of this step we will have a dictionary of bazel rules
+# that are interesting to us (libraries, binaries, etc.) along
+# with their most important metadata (sources, headers, dependencies).
+#
+# Example of a single bazel rule after being populated:
+# '//:grpc' : { 'class': 'cc_library',
+#               'hdrs': ['//:include/grpc/byte_buffer.h', ... ],
+#               'srcs': ['//:src/core/lib/surface/init.cc', ... ],
+#               'deps': ['//:grpc_common', ...],
+#               ... }
 bazel_rules = {}
 for query in _BAZEL_DEPS_QUERIES:
     bazel_rules.update(
         _extract_rules_from_bazel_xml(_bazel_query_xml_tree(query)))
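Since Step 1 hinges on the shape of the "bazel query --output xml" output, the following self-contained sketch shows how such a document maps onto the rule dictionaries built by _rule_dict_from_xml_node and _extract_rules_from_bazel_xml (a made-up cc_library rule, heavily trimmed compared to real bazel output):

import xml.etree.ElementTree as ET

_SAMPLE_QUERY_XML = """
<query version="2">
  <rule class="cc_library" name="//:grpc">
    <list name="hdrs">
      <label value="//:include/grpc/byte_buffer.h"/>
    </list>
    <list name="deps">
      <label value="//:grpc_common"/>
    </list>
  </rule>
</query>
"""

root = ET.fromstring(_SAMPLE_QUERY_XML)
for rule in root:
    if rule.tag == 'rule':
        rule_dict = {'class': rule.attrib.get('class'),
                     'name': rule.attrib.get('name')}
        for child in rule:
            if child.tag == 'list':
                rule_dict[child.attrib['name']] = [
                    label.attrib['value'] for label in child]
        print(rule_dict)
# -> {'class': 'cc_library', 'name': '//:grpc',
#     'hdrs': ['//:include/grpc/byte_buffer.h'], 'deps': ['//:grpc_common']}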
 
+# Step 1a: Knowing the transitive closure of dependencies will make
+# the postprocessing simpler, so compute the info for all our rules.
+#
+# Example:
+# '//:grpc' : { ...,
+#               'transitive_deps': ['//:gpr_base', ...] }
 _populate_transitive_deps(bazel_rules)
 
-tests = _filter_cc_tests(_extract_cc_tests(bazel_rules))
-test_metadata = _generate_build_extra_metadata_for_tests(tests, bazel_rules)
-
-all_metadata = {}
-all_metadata.update(_BUILD_EXTRA_METADATA)
-all_metadata.update(test_metadata)
-
-all_targets_dict = _generate_build_metadata(all_metadata, bazel_rules)
+# Step 2: Extract the known bazel cc_test tests. While most tests
+# build just fine with other build systems, some of them would be
+# too difficult to build and run elsewhere, so we simply exclude
+# the ones we don't want.
+# Note that while making tests buildable with build systems other
+# than bazel is extra effort, we still need to do it for these reasons:
+# - If our cmake build doesn't have any tests at all, it's hard to make
+#   sure that what it built actually works (we need at least some "smoke tests").
+#   This is quite important because the build flags between bazel and non-bazel
+#   builds might differ (sometimes for interesting reasons that are not easy
+#   to overcome), which makes it even more important to have at least some
+#   tests for cmake/make.
+# - Our portability suite actually runs cmake tests, and migrating the
+#   portability suite fully to bazel might be intricate (e.g. it's unclear
+#   whether it's possible to get good enough coverage of different
+#   compilers / distros etc. with bazel).
+# - Some things that are considered "tests" in build.yaml-based builds are
+#   actually binaries we'd want to be able to build anyway (qps_json_worker,
+#   interop_client, interop_server, grpc_cli), so it's unclear how much
+#   make/cmake simplification we would gain by removing just some (but not
+#   all) tests.
+# TODO(jtattermusch): Investigate the feasibility of running the portability
+# suite with bazel.
+tests = _exclude_unwanted_cc_tests(_extract_cc_tests(bazel_rules))
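As an illustration of the mechanism behind the exclusion step (the prefix list and helper below are hypothetical; the real filters are hardcoded inside _exclude_unwanted_cc_tests):

# Hypothetical exclusion list, for illustration only.
_EXCLUDED_TEST_PREFIXES = ['test/cpp/qps:']

def _exclude_by_prefix(tests, excluded_prefixes):
    """Keeps only tests that match none of the given label prefixes."""
    return [test for test in tests
            if not any(test.startswith(p) for p in excluded_prefixes)]

print(_exclude_by_prefix(
    ['test/cpp/qps:qps_test', 'test/core/slice:slice_string_helpers_test'],
    _EXCLUDED_TEST_PREFIXES))
# -> ['test/core/slice:slice_string_helpers_test']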
+
+# Step 3: Generate the "extra metadata" for all our build targets.
+# While the bazel rules give us most of the information we need,
+# the legacy "build.yaml" format requires some additional fields that
+# we cannot get from bazel alone (we call those fields the "extra metadata").
+# In this step, we basically analyze the build metadata we have from bazel
+# and use heuristics to determine (and sometimes guess) the right
+# extra metadata to use for each target.
+#
+# - For some targets (such as the public libraries, helper libraries
+#   and executables), determining the right extra metadata is hard to do
+#   automatically. For these targets, the extra metadata is supplied "manually"
+#   in the form of the _BUILD_EXTRA_METADATA dictionary. That allows us to match
+#   the semantics of the legacy "build.yaml" as closely as possible.
+#
+# - For test binaries, it is possible to generate the "extra metadata" mostly
+#   automatically using a rule-based heuristic approach, because most tests
+#   look and behave alike from the build's perspective.
+#
+# TODO(jtattermusch): Of course, neither "_BUILD_EXTRA_METADATA" nor
+# the heuristic approach used for tests is ideal, and they cannot be made
+# to cover all possible situations (they are tailored to the way
+# the grpc build currently works), but the idea was to start with something
+# reasonably simple that matches the "build.yaml"-like semantics as closely
+# as possible (to avoid changing too many things at once) and gradually get
+# rid of the legacy "build.yaml"-specific fields one by one. Once that is
+# done, only very little "extra metadata" would be needed and/or it would
+# be trivial to generate it automatically.
+all_extra_metadata = {}
+all_extra_metadata.update(_BUILD_EXTRA_METADATA)
+all_extra_metadata.update(
+    _generate_build_extra_metadata_for_tests(tests, bazel_rules))
+
+# Step 4: Generate the final metadata for all the targets.
+# This is done by combining the bazel build metadata and the "extra metadata"
+# we obtained in the previous step.
+# In this step, we also perform some non-trivial massaging of the target
+# metadata to end up with a result that is as similar to the legacy
+# build.yaml data as possible.
+# - Some targets get renamed (to match the legacy build.yaml target names).
+# - Some intermediate libraries get elided ("expanded") to better match the
+#   set of targets provided by the legacy build.yaml build.
+#
+# Originally the target renaming was introduced to address these concerns:
+# - Avoid changing too many things at the same time and avoid people getting
+#   confused by some well-known targets suddenly being missing.
+# - Makefile/cmake and also the language-specific generators rely on some
+#   build targets being called exactly the way they are. Some of our testing
+#   scripts also invoke executables (e.g. "qps_json_driver") by their name.
+# - The autogenerated test names from bazel include the package path
+#   (e.g. "test_cpp_TEST_NAME"). Without renaming, the target names would
+#   end up pretty ugly (e.g. test_cpp_qps_qps_json_driver).
+# TODO(jtattermusch): Re-evaluate the need for target renaming in the future.
+#
+# Example of a single generated target:
+# 'grpc' : { 'language': 'c',
+#            'public_headers': ['include/grpc/byte_buffer.h', ... ],
+#            'headers': ['src/core/ext/filters/client_channel/client_channel.h', ... ],
+#            'src': ['src/core/lib/surface/init.cc', ... ],
+#            'deps': ['gpr', 'address_sorting', ...],
+#            ... }
+all_targets_dict = _generate_build_metadata(all_extra_metadata, bazel_rules)
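The rename step itself is mechanically simple; here is a condensed sketch of the idea (hypothetical helper and target names; the real logic in _generate_build_metadata also has to patch up references to renamed targets in other targets' 'deps' lists):

def _apply_renames(targets, extra_metadata):
    # Re-key each target by its '_RENAME' value, falling back to the
    # original name when no rename was requested.
    renamed = {}
    for name, target in targets.items():
        to_name = extra_metadata.get(name, {}).get('_RENAME', name)
        renamed[to_name] = dict(target, name=to_name)
    return renamed

targets = {'test/cpp/qps:qps_json_driver':
           {'build': 'test', 'name': 'test/cpp/qps:qps_json_driver'}}
extra = {'test/cpp/qps:qps_json_driver': {'_RENAME': 'qps_json_driver'}}
print(_apply_renames(targets, extra))
# -> {'qps_json_driver': {'build': 'test', 'name': 'qps_json_driver'}}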
+
+# Step 5: Convert the dictionary with all the targets to a dict that has
+# the desired "build.yaml"-like layout.
+# TODO(jtattermusch): We use the custom "build.yaml"-like layout because
+# currently all other build systems use that format as their source of truth.
+# In the future, we can get rid of this custom & legacy format entirely,
+# but we would need to update the generators for the other build systems
+# at the same time.
+#
+# Layout of the result:
+# { 'libs': { TARGET_DICT_FOR_LIB_XYZ, ... },
+#   'targets': { TARGET_DICT_FOR_BIN_XYZ, ... },
+#   'tests': { TARGET_DICT_FOR_TEST_XYZ, ... } }
 build_yaml_like = _convert_to_build_yaml_like(all_targets_dict)
 
-# if a test uses source files from src/ directly, it's a little bit suspicious
-for tgt in build_yaml_like['targets']:
-    if tgt['build'] == 'test':
-        for src in tgt['src']:
-            if src.startswith('src/') and not src.endswith('.proto'):
-                print('source file from under "src/" tree used in test ' +
-                      tgt['name'] + ': ' + src)
+# detect and report some suspicious situations we've seen before
+_detect_and_print_issues(build_yaml_like)
 
+# Step 6: Store build_autogenerated.yaml in a deterministic (=sorted)
+# and cleaned-up form.
+# A basic overview of the resulting "build.yaml"-like format is here:
+# https://github.com/grpc/grpc/blob/master/templates/README.md
+# TODO(jtattermusch): The "cleanup" function is taken from the legacy
+# build system (which used build.yaml) and can eventually be removed.
 build_yaml_string = build_cleaner.cleaned_build_yaml_dict_as_string(
     build_yaml_like)
 with open('build_autogenerated.yaml', 'w') as file:
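What "deterministic (=sorted)" means in practice can be shown with plain PyYAML (toy data below; the real build_cleaner.cleaned_build_yaml_dict_as_string does more than just sorting):

import yaml

# Toy stand-in for the real build_yaml_like dictionary.
build_yaml_like_sketch = {
    'libs': [{'name': 'grpc', 'build': 'all', 'language': 'c'}],
    'targets': [{'name': 'qps_json_driver', 'build': 'test'}],
    'tests': [],
}

# Sorting keys makes repeated runs emit byte-identical YAML, which keeps
# diffs of the checked-in build_autogenerated.yaml reviewable.
print(yaml.dump(build_yaml_like_sketch, default_flow_style=False,
                sort_keys=True))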