Move `TopologicalSortMessagesInFile` to helpers.cc

We would use this function in helpers.cc for identifying the proto fields to inline.  Better to have it there.

PiperOrigin-RevId: 580982766
pull/14697/head
Protobuf Team Bot 1 year ago committed by Copybara-Service
parent b1370701fd
commit 1c8ee7c44e
  1. 148
      src/google/protobuf/compiler/cpp/file.cc
  2. 149
      src/google/protobuf/compiler/cpp/helpers.cc
  3. 3
      src/google/protobuf/compiler/cpp/helpers.h

@ -88,154 +88,6 @@ void UnmuteWuninitialized(io::Printer* p) {
#endif // __llvm__
)");
}
// TopologicalSortMessagesInFile topologically sorts and returns a vector of
// proto descriptors defined in the file provided as input. The underlying
// graph is defined using dependency relationship between protos. For example,
// if proto A contains proto B as a member, then proto B would be ordered before
// proto A in a topological ordering, assuming there is no mutual dependence
// between the two protos. The topological order is used to emit proto
// declarations so that a proto is declared after all the protos it is dependent
// on have been declared (again assuming no mutual dependence). This is needed
// in cases where we may declare proto B as a member of proto A using an object,
// instead of a pointer.
//
// The proto dependency graph can have cycles. So instead of directly working
// with protos, we compute strong connected components (SCCs) composed of protos
// with mutual dependence. The dependency graph on SCCs is a directed acyclic
// graph (DAG) and therefore a topological order can be computed for it i.e. an
// order where an SCC is ordered after all other SCCs it is dependent on have
// been ordered.
//
// The function below first constructs the SCC graph and then computes a
// deterministic topological order for the graph.
//
// For computing the SCC graph, we follow the following steps:
// 1. Collect the descriptors for the messages in the file.
// 2. Construct a map for descriptor to SCC mapping.
// 3. Construct a map for dependence between SCCs, referred to as
// child_to_parent_scc_map below. This map constructed by running a BFS on the
// SCCs.
//
// For computing a deterministic topological order on the graph computed in step
// 3 above, we do the following:
// 1. Since the graph on SCCs is a DAG, therefore there will be at least one SCC
// that does not depend on other SCCs. We first construct a list of all such
// SCCs.
// 2. Next we run a BFS starting with the list of SCCs computed in step 1. For
// each SCC, we track the number of the SCC it is dependent on and the number of
// those SCC that have been ordered. Once all the SCCs an SCC is dependent on
// have been ordered, this SCC is added to list of SCCs that are to be ordered
// next.
// 3. Within an SCC, the descriptors are ordered on the basis of the full_name()
// of the descriptors.
std::vector<const Descriptor*> TopologicalSortMessagesInFile(
const FileDescriptor* file, MessageSCCAnalyzer& scc_analyzer) {
// Collect the messages defined in this file.
std::vector<const Descriptor*> messages_in_file = FlattenMessagesInFile(file);
if (messages_in_file.empty()) return {};
// Populate the map from the descriptor to the SCC to which the descriptor
// belongs.
absl::flat_hash_map<const Descriptor*, const SCC*> descriptor_to_scc_map;
descriptor_to_scc_map.reserve(messages_in_file.size());
for (const Descriptor* d : messages_in_file) {
descriptor_to_scc_map.emplace(d, scc_analyzer.GetSCC(d));
}
ABSL_DCHECK(messages_in_file.size() == descriptor_to_scc_map.size())
<< "messages_in_file has duplicate messages!";
// Each parent SCC has information about the child SCCs i.e. SCCs for fields
// that are contained in the protos that belong to the parent SCC. Use this
// information to construct the inverse map from child SCC to parent SCC.
absl::flat_hash_map<const SCC*, absl::flat_hash_set<const SCC*>>
child_to_parent_scc_map;
// For recording the number of edges from each SCC to other SCCs in the
// forward map.
absl::flat_hash_map<const SCC*, int> scc_to_outgoing_edges_map;
std::queue<const SCC*> sccs_to_process;
for (const auto& p : descriptor_to_scc_map) {
sccs_to_process.push(p.second);
}
// Run a BFS to fill the two data structures: child_to_parent_scc_map and
// scc_to_outgoing_edges_map.
while (!sccs_to_process.empty()) {
const SCC* scc = sccs_to_process.front();
sccs_to_process.pop();
auto& count = scc_to_outgoing_edges_map[scc];
for (const auto& child : scc->children) {
// Test whether this child has been seen thus far. We do not know if the
// children SCC vector contains unique children SCC.
auto& parent_set = child_to_parent_scc_map[child];
if (parent_set.empty()) {
// Just added.
sccs_to_process.push(child);
}
auto ret = parent_set.insert(scc);
if (ret.second) {
++count;
}
}
}
std::vector<const SCC*> next_scc_q;
// Find out the SCCs that do not have an outgoing edge i.e. the protos in this
// SCC do not depend on protos other than the ones in this SCC.
for (const auto& p : scc_to_outgoing_edges_map) {
if (p.second == 0) {
next_scc_q.push_back(p.first);
}
}
ABSL_DCHECK(!next_scc_q.empty()) << "No independent components!";
// Topologically sort the SCCs.
// If an SCC no longer has an outgoing edge i.e. all the SCCs it depends on
// have been ordered, then this SCC is now a candidate for ordering.
std::vector<const Descriptor*> sorted_messages;
while (!next_scc_q.empty()) {
std::vector<const SCC*> current_scc_q;
current_scc_q.swap(next_scc_q);
// SCCs present in the current_scc_q are topologically equivalent to each
// other. Therefore they can be added to the output in any order. We sort
// these SCCs by the full_name() of the first descriptor that belongs to the
// SCC. This works well since the descriptors in each SCC are sorted by
// full_name() and also that a descriptor can be part of only one SCC.
std::sort(current_scc_q.begin(), current_scc_q.end(),
[](const SCC* a, const SCC* b) {
ABSL_DCHECK(!a->descriptors.empty()) << "No descriptors!";
ABSL_DCHECK(!b->descriptors.empty()) << "No descriptors!";
const Descriptor* ad = a->descriptors[0];
const Descriptor* bd = b->descriptors[0];
return ad->full_name() < bd->full_name();
});
while (!current_scc_q.empty()) {
const SCC* scc = current_scc_q.back();
current_scc_q.pop_back();
// Messages in an SCC are already sorted on full_name(). So we can emit
// them right away.
for (const Descriptor* d : scc->descriptors) {
// Only push messages that are defined in the file.
if (descriptor_to_scc_map.contains(d)) {
sorted_messages.push_back(d);
}
}
// Find all the SCCs that are dependent on the current SCC.
const auto& parents = child_to_parent_scc_map.find(scc);
if (parents == child_to_parent_scc_map.end()) continue;
for (const SCC* parent : parents->second) {
auto it = scc_to_outgoing_edges_map.find(parent);
ABSL_CHECK(it != scc_to_outgoing_edges_map.end());
ABSL_CHECK(it->second > 0);
// Reduce the dependency count for the SCC. In case the dependency
// count reaches 0, add the SCC to the list of SCCs to be ordered next.
it->second--;
if (it->second == 0) {
next_scc_q.push_back(parent);
}
}
}
}
for (const auto& p : scc_to_outgoing_edges_map) {
ABSL_DCHECK(p.second == 0) << "SCC left behind!";
}
return sorted_messages;
}
} // namespace
bool FileGenerator::ShouldSkipDependencyImports(

@ -15,6 +15,7 @@
#include <cstdint>
#include <limits>
#include <memory>
#include <queue>
#include <string>
#include <utility>
#include <vector>
@ -1317,6 +1318,154 @@ void FlattenMessagesInFile(const FileDescriptor* file,
}
}
// TopologicalSortMessagesInFile topologically sorts and returns a vector of
// proto descriptors defined in the file provided as input. The underlying
// graph is defined using dependency relationship between protos. For example,
// if proto A contains proto B as a member, then proto B would be ordered before
// proto A in a topological ordering, assuming there is no mutual dependence
// between the two protos. The topological order is used to emit proto
// declarations so that a proto is declared after all the protos it is dependent
// on have been declared (again assuming no mutual dependence). This is needed
// in cases where we may declare proto B as a member of proto A using an object,
// instead of a pointer.
//
// The proto dependency graph can have cycles. So instead of directly working
// with protos, we compute strong connected components (SCCs) composed of protos
// with mutual dependence. The dependency graph on SCCs is a directed acyclic
// graph (DAG) and therefore a topological order can be computed for it i.e. an
// order where an SCC is ordered after all other SCCs it is dependent on have
// been ordered.
//
// The function below first constructs the SCC graph and then computes a
// deterministic topological order for the graph.
//
// For computing the SCC graph, we follow the following steps:
// 1. Collect the descriptors for the messages in the file.
// 2. Construct a map for descriptor to SCC mapping.
// 3. Construct a map for dependence between SCCs, referred to as
// child_to_parent_scc_map below. This map constructed by running a BFS on the
// SCCs.
//
// For computing a deterministic topological order on the graph computed in step
// 3 above, we do the following:
// 1. Since the graph on SCCs is a DAG, therefore there will be at least one SCC
// that does not depend on other SCCs. We first construct a list of all such
// SCCs.
// 2. Next we run a BFS starting with the list of SCCs computed in step 1. For
// each SCC, we track the number of the SCC it is dependent on and the number of
// those SCC that have been ordered. Once all the SCCs an SCC is dependent on
// have been ordered, this SCC is added to list of SCCs that are to be ordered
// next.
// 3. Within an SCC, the descriptors are ordered on the basis of the full_name()
// of the descriptors.
std::vector<const Descriptor*> TopologicalSortMessagesInFile(
const FileDescriptor* file, MessageSCCAnalyzer& scc_analyzer) {
// Collect the messages defined in this file.
std::vector<const Descriptor*> messages_in_file = FlattenMessagesInFile(file);
if (messages_in_file.empty()) return {};
// Populate the map from the descriptor to the SCC to which the descriptor
// belongs.
absl::flat_hash_map<const Descriptor*, const SCC*> descriptor_to_scc_map;
descriptor_to_scc_map.reserve(messages_in_file.size());
for (const Descriptor* d : messages_in_file) {
descriptor_to_scc_map.emplace(d, scc_analyzer.GetSCC(d));
}
ABSL_DCHECK(messages_in_file.size() == descriptor_to_scc_map.size())
<< "messages_in_file has duplicate messages!";
// Each parent SCC has information about the child SCCs i.e. SCCs for fields
// that are contained in the protos that belong to the parent SCC. Use this
// information to construct the inverse map from child SCC to parent SCC.
absl::flat_hash_map<const SCC*, absl::flat_hash_set<const SCC*>>
child_to_parent_scc_map;
// For recording the number of edges from each SCC to other SCCs in the
// forward map.
absl::flat_hash_map<const SCC*, int> scc_to_outgoing_edges_map;
std::queue<const SCC*> sccs_to_process;
for (const auto& p : descriptor_to_scc_map) {
sccs_to_process.push(p.second);
}
// Run a BFS to fill the two data structures: child_to_parent_scc_map and
// scc_to_outgoing_edges_map.
while (!sccs_to_process.empty()) {
const SCC* scc = sccs_to_process.front();
sccs_to_process.pop();
auto& count = scc_to_outgoing_edges_map[scc];
for (const auto& child : scc->children) {
// Test whether this child has been seen thus far. We do not know if the
// children SCC vector contains unique children SCC.
auto& parent_set = child_to_parent_scc_map[child];
if (parent_set.empty()) {
// Just added.
sccs_to_process.push(child);
}
auto ret = parent_set.insert(scc);
if (ret.second) {
++count;
}
}
}
std::vector<const SCC*> next_scc_q;
// Find out the SCCs that do not have an outgoing edge i.e. the protos in this
// SCC do not depend on protos other than the ones in this SCC.
for (const auto& p : scc_to_outgoing_edges_map) {
if (p.second == 0) {
next_scc_q.push_back(p.first);
}
}
ABSL_DCHECK(!next_scc_q.empty()) << "No independent components!";
// Topologically sort the SCCs.
// If an SCC no longer has an outgoing edge i.e. all the SCCs it depends on
// have been ordered, then this SCC is now a candidate for ordering.
std::vector<const Descriptor*> sorted_messages;
while (!next_scc_q.empty()) {
std::vector<const SCC*> current_scc_q;
current_scc_q.swap(next_scc_q);
// SCCs present in the current_scc_q are topologically equivalent to each
// other. Therefore they can be added to the output in any order. We sort
// these SCCs by the full_name() of the first descriptor that belongs to the
// SCC. This works well since the descriptors in each SCC are sorted by
// full_name() and also that a descriptor can be part of only one SCC.
std::sort(current_scc_q.begin(), current_scc_q.end(),
[](const SCC* a, const SCC* b) {
ABSL_DCHECK(!a->descriptors.empty()) << "No descriptors!";
ABSL_DCHECK(!b->descriptors.empty()) << "No descriptors!";
const Descriptor* ad = a->descriptors[0];
const Descriptor* bd = b->descriptors[0];
return ad->full_name() < bd->full_name();
});
while (!current_scc_q.empty()) {
const SCC* scc = current_scc_q.back();
current_scc_q.pop_back();
// Messages in an SCC are already sorted on full_name(). So we can emit
// them right away.
for (const Descriptor* d : scc->descriptors) {
// Only push messages that are defined in the file.
if (descriptor_to_scc_map.contains(d)) {
sorted_messages.push_back(d);
}
}
// Find all the SCCs that are dependent on the current SCC.
const auto& parents = child_to_parent_scc_map.find(scc);
if (parents == child_to_parent_scc_map.end()) continue;
for (const SCC* parent : parents->second) {
auto it = scc_to_outgoing_edges_map.find(parent);
ABSL_CHECK(it != scc_to_outgoing_edges_map.end());
ABSL_CHECK(it->second > 0);
// Reduce the dependency count for the SCC. In case the dependency
// count reaches 0, add the SCC to the list of SCCs to be ordered next.
it->second--;
if (it->second == 0) {
next_scc_q.push_back(parent);
}
}
}
}
for (const auto& p : scc_to_outgoing_edges_map) {
ABSL_DCHECK(p.second == 0) << "SCC left behind!";
}
return sorted_messages;
}
bool HasWeakFields(const Descriptor* descriptor, const Options& options) {
for (int i = 0; i < descriptor->field_count(); i++) {
if (IsWeak(descriptor->field(i), options)) return true;

@ -629,6 +629,9 @@ inline std::vector<const Descriptor*> FlattenMessagesInFile(
return result;
}
std::vector<const Descriptor*> TopologicalSortMessagesInFile(
const FileDescriptor* file, MessageSCCAnalyzer& scc_analyzer);
template <typename F>
void ForEachMessage(const Descriptor* descriptor, F&& func) {
for (int i = 0; i < descriptor->nested_type_count(); i++)

Loading…
Cancel
Save