Merge branch 'main' into minitable-api

pull/13171/head
Joshua Haberman 3 years ago
commit 2a79bf734f
  1. 161
      BUILD
  2. 6
      WORKSPACE
  3. 21
      bazel/BUILD
  4. 134
      bazel/py_proto_library.bzl
  5. 7
      bazel/workspace_deps.bzl
  6. 4
      cmake/make_cmakelists.py
  7. 58
      python/BUILD
  8. 200
      tests/BUILD
  9. 94
      tests/bindings/lua/BUILD
  10. 0
      tests/conformance_upb_failures.txt
  11. 1
      tests/corpus/README
  12. 26
      tests/corpus/temp.cc
  13. 8
      third_party/utf8_range/BUILD
  14. 92
      third_party/utf8_range/naive.c
  15. 157
      third_party/utf8_range/range2-neon.c
  16. 170
      third_party/utf8_range/range2-sse.c
  17. 395
      third_party/utf8_range/utf8_range.c
  18. 2
      third_party/utf8_range/utf8_range.h
  19. 5
      upb/bindings/README
  20. 63
      upb/bindings/lua/BUILD
  21. 8
      upb/bindings/lua/README.md
  22. 2
      upb/bindings/lua/main.c
  23. 0
      upb/bindings/lua/test.proto
  24. 2
      upb/bindings/lua/test_upb.lua
  25. 0
      upb/conformance_upb.c
  26. 1
      upb/conformance_upb_failures.txt
  27. 0
      upb/empty.proto
  28. 0
      upb/file_descriptor_parsenew_fuzzer.cc
  29. 0
      upb/test.proto
  30. 4
      upb/test_cpp.cc
  31. 0
      upb/test_cpp.proto
  32. 2
      upb/test_generated_code.cc
  33. 0
      upb/test_table.cc

161
BUILD

@ -26,6 +26,8 @@
load( load(
"//bazel:build_defs.bzl", "//bazel:build_defs.bzl",
"UPB_DEFAULT_COPTS", "UPB_DEFAULT_COPTS",
"UPB_DEFAULT_CPPOPTS",
"make_shell_script",
"upb_amalgamation", # copybara:strip_for_google3 "upb_amalgamation", # copybara:strip_for_google3
) )
load( load(
@ -35,6 +37,7 @@ load(
"upb_proto_library_copts", "upb_proto_library_copts",
"upb_proto_reflection_library", "upb_proto_reflection_library",
) )
load("@rules_fuzzing//fuzzing:cc_defs.bzl", "cc_fuzz_test")
licenses(["notice"]) licenses(["notice"])
@ -238,6 +241,47 @@ cc_library(
], ],
) )
# Tests ########################################################################
cc_test(
name = "test_generated_code",
srcs = ["upb/test_generated_code.cc"],
deps = [
":empty_upbdefs_proto",
":test_messages_proto3_proto_upb",
":test_upb_proto",
"@com_google_googletest//:gtest_main",
],
)
proto_library(
name = "test_proto",
testonly = 1,
srcs = ["upb/test.proto"],
)
upb_proto_library(
name = "test_upb_proto",
testonly = 1,
deps = [":test_proto"],
)
proto_library(
name = "empty_proto",
srcs = ["upb/empty.proto"],
)
upb_proto_reflection_library(
name = "empty_upbdefs_proto",
testonly = 1,
deps = [":empty_proto"],
)
upb_proto_library(
name = "test_messages_proto3_proto_upb",
testonly = 1,
deps = ["@com_google_protobuf//:test_messages_proto3_proto"],
)
cc_test( cc_test(
name = "msg_test", name = "msg_test",
srcs = ["upb/msg_test.cc"], srcs = ["upb/msg_test.cc"],
@ -259,6 +303,121 @@ upb_proto_reflection_library(
deps = [":msg_test_proto"], deps = [":msg_test_proto"],
) )
# Protos and generated upb code (plain and reflection flavours) used by the
# ":test_cpp" test.
proto_library(
    name = "test_cpp_proto",
    srcs = ["upb/test_cpp.proto"],
    deps = ["@com_google_protobuf//:timestamp_proto"],
)

upb_proto_library(
    name = "test_cpp_upb_proto",
    deps = [":test_cpp_proto"],
)

upb_proto_reflection_library(
    name = "test_cpp_upb_proto_reflection",
    deps = [":test_cpp_proto"],
)
cc_test(
name = "test_cpp",
srcs = ["upb/test_cpp.cc"],
copts = UPB_DEFAULT_CPPOPTS,
deps = [
":test_cpp_upb_proto",
":test_cpp_upb_proto_reflection",
"//:json",
"//:port",
"//:reflection",
"//:upb",
"@com_google_googletest//:gtest_main",
],
)
cc_test(
name = "test_table",
srcs = ["upb/test_table.cc"],
copts = UPB_DEFAULT_CPPOPTS,
deps = [
"//:port",
"//:table",
"//:upb",
"@com_google_googletest//:gtest_main",
],
)
cc_fuzz_test(
name = "file_descriptor_parsenew_fuzzer",
srcs = ["upb/file_descriptor_parsenew_fuzzer.cc"],
deps = [
"//:descriptor_upb_proto",
"//:upb",
],
)
upb_proto_library(
name = "conformance_proto_upb",
testonly = 1,
deps = ["@com_google_protobuf//:conformance_proto"],
)
upb_proto_reflection_library(
name = "conformance_proto_upbdefs",
testonly = 1,
deps = ["@com_google_protobuf//:conformance_proto"],
)
upb_proto_reflection_library(
name = "test_messages_proto2_upbdefs",
testonly = 1,
deps = ["@com_google_protobuf//:test_messages_proto2_proto"],
)
upb_proto_reflection_library(
name = "test_messages_proto3_upbdefs",
testonly = 1,
deps = ["@com_google_protobuf//:test_messages_proto3_proto"],
)
cc_binary(
name = "conformance_upb",
testonly = 1,
srcs = ["upb/conformance_upb.c"],
copts = UPB_DEFAULT_COPTS,
data = ["upb/conformance_upb_failures.txt"],
deps = [
":conformance_proto_upb",
":conformance_proto_upbdefs",
":test_messages_proto2_upbdefs",
":test_messages_proto3_upbdefs",
"//:json",
"//:port",
"//:reflection",
"//:textformat",
"//:upb",
],
)
make_shell_script(
name = "gen_test_conformance_upb",
out = "test_conformance_upb.sh",
contents = "external/com_google_protobuf/conformance_test_runner " +
" --enforce_recommended " +
" --failure_list ./upb/conformance_upb_failures.txt" +
" ./conformance_upb",
)
sh_test(
name = "test_conformance_upb",
srcs = ["test_conformance_upb.sh"],
data = [
"upb/conformance_upb_failures.txt",
":conformance_upb",
"@com_google_protobuf//:conformance_test_runner",
],
deps = ["@bazel_tools//tools/bash/runfiles"],
)
# Internal C/C++ libraries ##################################################### # Internal C/C++ libraries #####################################################
cc_library( cc_library(
@ -365,7 +524,7 @@ exports_files(
"third_party/lunit/console.lua", "third_party/lunit/console.lua",
"third_party/lunit/lunit.lua", "third_party/lunit/lunit.lua",
], ],
visibility = ["//tests/bindings/lua:__pkg__"], visibility = ["//upb/bindings/lua:__pkg__"],
) )
filegroup( filegroup(

@ -47,9 +47,9 @@ register_toolchains("@system_python//:python_toolchain")
http_archive( http_archive(
name = "rules_fuzzing", name = "rules_fuzzing",
sha256 = "e1b54211f7cee604194db080a8765220d3ef5db2a873fded429ce13e74d93a6b", sha256 = "23bb074064c6f488d12044934ab1b0631e8e6898d5cf2f6bde087adb01111573",
strip_prefix = "rules_fuzzing-4bafba51ffd9d418d236adb61de36fda1a90e764", strip_prefix = "rules_fuzzing-0.3.1",
urls = ["https://github.com/bazelbuild/rules_fuzzing/archive/4bafba51ffd9d418d236adb61de36fda1a90e764.zip"], urls = ["https://github.com/bazelbuild/rules_fuzzing/archive/v0.3.1.zip"],
) )
load("@rules_fuzzing//fuzzing:repositories.bzl", "rules_fuzzing_dependencies") load("@rules_fuzzing//fuzzing:repositories.bzl", "rules_fuzzing_dependencies")

@ -23,8 +23,29 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
py_binary( py_binary(
name = "amalgamate", name = "amalgamate",
srcs = ["amalgamate.py"], srcs = ["amalgamate.py"],
visibility = ["//:__pkg__"], visibility = ["//:__pkg__"],
) )
# py_proto_library() is a private rule, only intended for internal use by upb.
# Hopefully py_proto_library() will eventually be available in rules_proto or
# another upstream package.
bzl_library(
name = "py_proto_library_bzl",
srcs = ["py_proto_library.bzl"],
)
bzl_library(
name = "upb_proto_library_bzl",
srcs = ["upb_proto_library.bzl"],
visibility = ["//visibility:public"],
deps = [
"@bazel_skylib//lib:paths",
"@bazel_tools//tools/cpp:toolchain_utils.bzl",
"@rules_proto//proto:defs",
],
)

@ -0,0 +1,134 @@
# Copyright (c) 2009-2021, Google LLC
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Google LLC nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL Google LLC BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""An implementation of py_proto_library().
We have to implement this ourselves because there is currently no reasonable
py_proto_library() rule available for Bazel.
Our py_proto_library() is similar to how a real py_proto_library() should work.
But it hasn't been deeply tested or reviewed, and upb should not be in the
business of vending py_proto_library(), so we keep it private to upb.
"""
load("@bazel_skylib//lib:paths.bzl", "paths")
load("@rules_proto//proto:defs.bzl", "ProtoInfo") # copybara:strip_for_google3
# Generic support code #########################################################
def _get_real_short_path(file):
    """Returns file.short_path with repository and virtual-import prefixes removed.

    Args:
      file: an object exposing a `short_path` string (a Bazel File).

    Returns:
      The trimmed path string, e.g. "google/protobuf/any.proto".
    """
    path = file.short_path

    # Files from other archives have short paths that look like:
    #   ../com_google_protobuf/google/protobuf/descriptor.proto
    # Drop everything up to and including the slash after the archive name.
    if path.startswith("../"):
        path = path[path.index("/", 3) + 1:]

    # Sometimes a few more prefixes remain, like:
    #   _virtual_imports/any_proto/google/protobuf/any.proto
    #   benchmarks/_virtual_imports/100_msgs_proto/benchmarks/100_msgs.proto
    # Keep only what follows "_virtual_imports/<name>/", e.g.
    # google/protobuf/any.proto.
    marker = "_virtual_imports/"
    if marker in path:
        after_marker = path.split(marker)[1]
        path = after_marker.split("/", 1)[1]

    return path
def _get_real_root(file):
    """Returns the part of file.path that precedes the file's real short path.

    The separator between the root and the real short path is dropped as well.
    """
    suffix_len = len(_get_real_short_path(file)) + 1
    return file.path[:-suffix_len]
def _generate_output_file(ctx, src, extension):
    """Declares the output file generated for `src`.

    The output name is the real short path of `src`, made relative to the
    package of the current label, with its extension replaced by `extension`.

    Returns:
      The File returned by ctx.actions.declare_file().
    """
    package_relative = paths.relativize(
        _get_real_short_path(src),
        ctx.label.package,
    )
    return ctx.actions.declare_file(
        paths.replace_extension(package_relative, extension),
    )
# py_proto_library() ###########################################################
def _py_proto_library_rule_impl(ctx):
    """Collects the Python sources reported for every target in `deps`.

    Args:
      ctx: the rule context; each entry of ctx.attr.deps must carry PyInfo.

    Returns:
      A list holding one DefaultInfo whose files are the union of the
      transitive_sources of all deps.
    """

    # A real py_proto_library() should enforce this constraint.
    # We don't bother for now, since it saves us some effort not to.
    #
    # if len(ctx.attr.deps) != 1:
    #     fail("only one deps dependency allowed.")

    # Merge the depsets via `transitive` instead of flattening each one with
    # to_list() during analysis.
    return [
        DefaultInfo(
            files = depset(
                transitive = [
                    dep[PyInfo].transitive_sources
                    for dep in ctx.attr.deps
                ],
            ),
        ),
    ]
def _py_proto_library_aspect_impl(target, ctx):
    """Runs protoc to generate *_pb2.py files for a target's direct sources.

    Args:
      target: the visited target; must provide ProtoInfo.
      ctx: the aspect context; ctx.executable._protoc is the compiler to run.

    Returns:
      A list holding one PyInfo whose transitive_sources are the files
      generated for this target's direct sources.
    """
    proto_info = target[ProtoInfo]
    proto_sources = proto_info.direct_sources

    # A proto_library may have no sources of its own.  There is nothing to
    # generate then, and srcs[0] below would fail on the empty list, so return
    # an empty provider instead.
    if not proto_sources:
        return [
            PyInfo(transitive_sources = depset()),
        ]

    srcs = [_generate_output_file(ctx, name, "_pb2.py") for name in proto_sources]

    # The descriptor sets are flattened because their paths are joined into a
    # single --descriptor_set_in argument.
    transitive_sets = proto_info.transitive_descriptor_sets.to_list()
    ctx.actions.run(
        inputs = depset(
            direct = [proto_info.direct_descriptor_set],
            transitive = [proto_info.transitive_descriptor_sets],
        ),
        outputs = srcs,
        executable = ctx.executable._protoc,
        arguments = [
            # All outputs share one root, so it is derived from the first one.
            "--python_out=" + _get_real_root(srcs[0]),
            "--descriptor_set_in=" + ctx.configuration.host_path_separator.join([f.path for f in transitive_sets]),
        ] + [_get_real_short_path(file) for file in proto_sources],
        progress_message = "Generating Python protos for :" + ctx.label.name,
    )
    outs_depset = depset(srcs)
    return [
        PyInfo(transitive_sources = outs_depset),
    ]
# Aspect implemented by _py_proto_library_aspect_impl.  It propagates along the
# "deps" attribute of the targets it visits and advertises PyInfo as its
# provider.
_py_proto_library_aspect = aspect(
    attrs = {
        # The protoc executable, built in the "exec" configuration.
        "_protoc": attr.label(
            executable = True,
            cfg = "exec",
            default = "@com_google_protobuf//:protoc",
        ),
    },
    implementation = _py_proto_library_aspect_impl,
    provides = [
        PyInfo,
    ],
    attr_aspects = ["deps"],
)
# Rule implemented by _py_proto_library_rule_impl.  Every entry of "deps" must
# be a proto_library providing ProtoInfo; _py_proto_library_aspect is applied
# to each of them.
py_proto_library = rule(
    output_to_genfiles = True,
    implementation = _py_proto_library_rule_impl,
    attrs = {
        "deps": attr.label_list(
            aspects = [_py_proto_library_aspect],
            allow_rules = ["proto_library"],
            providers = [ProtoInfo],
        ),
    },
)

@ -29,11 +29,14 @@ def upb_deps():
] ]
) )
rules_python_version = "740825b7f74930c62f44af95c9a4c1bd428d2c53" # Latest @ 2021-06-23
maybe( maybe(
http_archive, http_archive,
name = "rules_python", name = "rules_python",
url = "https://github.com/bazelbuild/rules_python/releases/download/0.1.0/rules_python-0.1.0.tar.gz", strip_prefix = "rules_python-{}".format(rules_python_version),
sha256 = "b6d46438523a3ec0f3cead544190ee13223a52f6a6765a29eae7b7cc24cc83a0", url = "https://github.com/bazelbuild/rules_python/archive/{}.zip".format(rules_python_version),
sha256 = "09a3c4791c61b62c2cbc5b2cbea4ccc32487b38c7a2cc8f87a794d7a659cc742",
) )
maybe( maybe(

@ -121,6 +121,9 @@ class BuildFileFunctions(object):
# self._add_deps(kwargs) # self._add_deps(kwargs)
pass pass
def cc_fuzz_test(self, **kwargs):
pass
def py_library(self, **kwargs): def py_library(self, **kwargs):
pass pass
@ -311,6 +314,7 @@ converter = Converter()
def GetDict(obj): def GetDict(obj):
ret = {} ret = {}
ret["UPB_DEFAULT_COPTS"] = [] # HACK ret["UPB_DEFAULT_COPTS"] = [] # HACK
ret["UPB_DEFAULT_CPPOPTS"] = [] # HACK
for k in dir(obj): for k in dir(obj):
if not k.startswith("_"): if not k.startswith("_"):
ret[k] = getattr(obj, k); ret[k] = getattr(obj, k);

@ -27,6 +27,14 @@ load(
"//bazel:build_defs.bzl", "//bazel:build_defs.bzl",
"UPB_DEFAULT_COPTS", "UPB_DEFAULT_COPTS",
) )
load(
"//bazel:py_proto_library.bzl",
"py_proto_library",
)
load(
"@rules_python//python:packaging.bzl",
"py_wheel",
)
cc_binary( cc_binary(
name = "message", name = "message",
@ -108,12 +116,17 @@ genrule(
visibility = ["//python:__subpackages__"], visibility = ["//python:__subpackages__"],
) )
py_library( filegroup(
name = "message_ext", name = "extension_files",
data = [ srcs = [
"google/protobuf/pyext/_message" + EXT_SUFFIX, "google/protobuf/pyext/_message" + EXT_SUFFIX,
"google/protobuf/internal/_api_implementation" + EXT_SUFFIX, "google/protobuf/internal/_api_implementation" + EXT_SUFFIX,
], ],
)
py_library(
name = "message_ext",
data = [":extension_files"],
imports = ["."], imports = ["."],
visibility = ["//python:__subpackages__"], visibility = ["//python:__subpackages__"],
) )
@ -132,3 +145,42 @@ py_test(
imports = ["."], imports = ["."],
legacy_create_init = False, legacy_create_init = False,
) )
py_proto_library(
name = "well_known_proto_pb2",
deps = [
"@com_google_protobuf//:any_proto",
"@com_google_protobuf//:api_proto",
"@com_google_protobuf//:compiler_plugin_proto",
"@com_google_protobuf//:descriptor_proto",
"@com_google_protobuf//:duration_proto",
"@com_google_protobuf//:empty_proto",
"@com_google_protobuf//:field_mask_proto",
"@com_google_protobuf//:source_context_proto",
"@com_google_protobuf//:struct_proto",
"@com_google_protobuf//:timestamp_proto",
"@com_google_protobuf//:type_proto",
"@com_google_protobuf//:wrappers_proto",
],
)
py_wheel(
name = "binary_wheel",
abi = "abi3",
distribution = "protobuf",
python_tag = "cp36",
# TODO(https://github.com/protocolbuffers/upb/issues/502): we need to make
# this a select() that is calculated from the platform we are actually
# building on.
platform = "manylinux2014_x86_64",
version = "4.20.0",
strip_path_prefixes = ["python/"],
deps = [
":extension_files",
":well_known_proto_pb2",
# TODO(https://github.com/protocolbuffers/upb/issues/503): currently
# this includes the unit tests. We should filter these out so we are
# only distributing true source files.
"@com_google_protobuf//:python_srcs",
],
)

@ -1,200 +0,0 @@
# Copyright (c) 2009-2021, Google LLC
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Google LLC nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL Google LLC BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
load(
"//bazel:build_defs.bzl",
"UPB_DEFAULT_COPTS",
"UPB_DEFAULT_CPPOPTS",
"make_shell_script",
)
load(
"//bazel:upb_proto_library.bzl",
"upb_proto_library",
"upb_proto_reflection_library",
)
load("@rules_fuzzing//fuzzing:cc_defs.bzl", "cc_fuzz_test")
licenses(["notice"])
proto_library(
name = "test_proto",
testonly = 1,
srcs = ["test.proto"],
)
upb_proto_library(
name = "test_upb_proto",
testonly = 1,
deps = [":test_proto"],
)
cc_test(
name = "test_generated_code",
srcs = ["test_generated_code.cc"],
deps = [
":empty_upbdefs_proto",
":test_messages_proto3_proto_upb",
":test_upb_proto",
"@com_google_googletest//:gtest_main",
],
)
proto_library(
name = "empty_proto",
srcs = ["empty.proto"],
)
upb_proto_reflection_library(
name = "empty_upbdefs_proto",
testonly = 1,
deps = [":empty_proto"],
)
upb_proto_library(
name = "test_messages_proto3_proto_upb",
testonly = 1,
deps = ["@com_google_protobuf//:test_messages_proto3_proto"],
)
proto_library(
name = "test_cpp_proto",
srcs = [
"test_cpp.proto",
],
deps = ["@com_google_protobuf//:timestamp_proto"]
)
upb_proto_library(
name = "test_cpp_upb_proto",
deps = ["test_cpp_proto"],
)
upb_proto_reflection_library(
name = "test_cpp_upb_proto_reflection",
deps = ["test_cpp_proto"],
)
cc_test(
name = "test_cpp",
srcs = ["test_cpp.cc"],
copts = UPB_DEFAULT_CPPOPTS,
deps = [
":test_cpp_upb_proto",
":test_cpp_upb_proto_reflection",
"//:json",
"//:port",
"//:reflection",
"//:upb",
"@com_google_googletest//:gtest_main",
],
)
cc_test(
name = "test_table",
srcs = ["test_table.cc"],
copts = UPB_DEFAULT_CPPOPTS,
deps = [
"//:port",
"//:table",
"//:upb",
"@com_google_googletest//:gtest_main",
],
)
cc_fuzz_test(
name = "file_descriptor_parsenew_fuzzer",
srcs = ["file_descriptor_parsenew_fuzzer.cc"],
deps = [
"//:descriptor_upb_proto",
"//:upb",
],
)
upb_proto_library(
name = "conformance_proto_upb",
testonly = 1,
deps = ["@com_google_protobuf//:conformance_proto"],
)
upb_proto_reflection_library(
name = "conformance_proto_upbdefs",
testonly = 1,
deps = ["@com_google_protobuf//:conformance_proto"],
)
upb_proto_reflection_library(
name = "test_messages_proto2_upbdefs",
testonly = 1,
deps = ["@com_google_protobuf//:test_messages_proto2_proto"],
)
upb_proto_reflection_library(
name = "test_messages_proto3_upbdefs",
testonly = 1,
deps = ["@com_google_protobuf//:test_messages_proto3_proto"],
)
cc_binary(
name = "conformance_upb",
testonly = 1,
srcs = [
"conformance_upb.c",
],
copts = UPB_DEFAULT_COPTS,
data = [
"conformance_upb_failures.txt",
],
deps = [
":conformance_proto_upb",
":conformance_proto_upbdefs",
":test_messages_proto2_upbdefs",
":test_messages_proto3_upbdefs",
"//:json",
"//:port",
"//:reflection",
"//:textformat",
"//:upb",
],
)
make_shell_script(
name = "gen_test_conformance_upb",
out = "test_conformance_upb.sh",
contents = "external/com_google_protobuf/conformance_test_runner " +
" --enforce_recommended " +
" --failure_list ./tests/conformance_upb_failures.txt" +
" ./tests/conformance_upb",
)
sh_test(
name = "test_conformance_upb",
srcs = ["test_conformance_upb.sh"],
data = [
"conformance_upb_failures.txt",
":conformance_upb",
"@com_google_protobuf//:conformance_test_runner",
],
deps = ["@bazel_tools//tools/bash/runfiles"],
)

@ -1,94 +0,0 @@
# Copyright (c) 2009-2021, Google LLC
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Google LLC nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL Google LLC BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
load(
"//upb/bindings/lua:lua_proto_library.bzl",
"lua_proto_library",
)
load(
"//bazel:build_defs.bzl",
"UPB_DEFAULT_COPTS",
)
licenses(["notice"])
cc_test(
name = "test_lua",
srcs = ["main.c"],
copts = UPB_DEFAULT_COPTS,
data = [
"test_upb.lua",
":descriptor_proto_lua",
":empty_proto_lua",
":test_messages_proto2_proto_lua",
":test_messages_proto3_proto_lua",
":test_proto_lua",
"//:third_party/lunit/console.lua",
"//:third_party/lunit/lunit.lua",
"//upb/bindings/lua:upb.lua",
"@com_google_protobuf//:conformance_proto",
"@com_google_protobuf//:descriptor_proto",
],
linkstatic = 1,
deps = [
"//upb/bindings/lua:lupb",
"@lua//:liblua",
],
)
proto_library(
name = "test_proto",
testonly = 1,
srcs = ["test.proto"],
deps = ["@com_google_protobuf//:timestamp_proto"],
)
lua_proto_library(
name = "test_proto_lua",
testonly = 1,
deps = [":test_proto"],
)
lua_proto_library(
name = "descriptor_proto_lua",
deps = ["@com_google_protobuf//:descriptor_proto"],
)
lua_proto_library(
name = "empty_proto_lua",
deps = ["@com_google_protobuf//:empty_proto"],
)
lua_proto_library(
name = "test_messages_proto3_proto_lua",
testonly = 1,
deps = ["@com_google_protobuf//:test_messages_proto3_proto"],
)
lua_proto_library(
name = "test_messages_proto2_proto_lua",
testonly = 1,
deps = ["@com_google_protobuf//:test_messages_proto2_proto"],
)

@ -1 +0,0 @@
Corpus folder for fuzzing

@ -1,26 +0,0 @@
// Copyright (c) 2009-2021, Google LLC
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of Google LLC nor the
// names of its contributors may be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL Google LLC BE LIABLE FOR ANY DIRECT,
// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Hello World

@ -1,8 +1,14 @@
# Pulled from: https://github.com/cyb70289/utf8
cc_library( cc_library(
name = "utf8_range", name = "utf8_range",
hdrs = ["utf8_range.h"], hdrs = ["utf8_range.h"],
srcs = ["utf8_range.c"], srcs = [
"naive.c",
"range2-neon.c",
"range2-sse.c",
],
visibility = ["//:__pkg__"], visibility = ["//:__pkg__"],
) )

@ -0,0 +1,92 @@
#include <stdio.h>
/*
* http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
*
* Table 3-7. Well-Formed UTF-8 Byte Sequences
*
* +--------------------+------------+-------------+------------+-------------+
* | Code Points | First Byte | Second Byte | Third Byte | Fourth Byte |
* +--------------------+------------+-------------+------------+-------------+
* | U+0000..U+007F | 00..7F | | | |
* +--------------------+------------+-------------+------------+-------------+
* | U+0080..U+07FF | C2..DF | 80..BF | | |
* +--------------------+------------+-------------+------------+-------------+
* | U+0800..U+0FFF | E0 | A0..BF | 80..BF | |
* +--------------------+------------+-------------+------------+-------------+
* | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | |
* +--------------------+------------+-------------+------------+-------------+
* | U+D000..U+D7FF | ED | 80..9F | 80..BF | |
* +--------------------+------------+-------------+------------+-------------+
* | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | |
* +--------------------+------------+-------------+------------+-------------+
* | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF |
* +--------------------+------------+-------------+------------+-------------+
* | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF |
* +--------------------+------------+-------------+------------+-------------+
* | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF |
* +--------------------+------------+-------------+------------+-------------+
*/
/* Nonzero when b lies in 0x80..0xBF, i.e. is a UTF-8 continuation byte. */
static int utf8_is_tail(unsigned char b)
{
    return (signed char)b <= (signed char)0xBF;
}

/*
 * Validate len bytes at data against Table 3-7 above.
 *
 * Return 0 - success, >0 - index(1 based) of first error char
 */
int utf8_naive(const unsigned char *data, int len)
{
    int consumed = 0;

    while (len) {
        const unsigned char lead = data[0];
        int width = 0; /* stays 0 when no well-formed sequence starts here */

        if (lead <= 0x7F) {
            /* 00..7F */
            width = 1;
        } else if (len >= 2 && lead >= 0xC2 && lead <= 0xDF &&
                   utf8_is_tail(data[1])) {
            /* C2..DF, 80..BF */
            width = 2;
        } else if (len >= 3) {
            const unsigned char second = data[1];
            /* Bytes 2 and 3 must both be in 80..BF for 3- and 4-byte forms */
            const int tails_ok = utf8_is_tail(second) && utf8_is_tail(data[2]);

            if (tails_ok &&
                /* E0, A0..BF, 80..BF */
                ((lead == 0xE0 && second >= 0xA0) ||
                 /* E1..EC, 80..BF, 80..BF */
                 (lead >= 0xE1 && lead <= 0xEC) ||
                 /* ED, 80..9F, 80..BF */
                 (lead == 0xED && second <= 0x9F) ||
                 /* EE..EF, 80..BF, 80..BF */
                 (lead >= 0xEE && lead <= 0xEF))) {
                width = 3;
            } else if (len >= 4 && tails_ok && utf8_is_tail(data[3]) &&
                       /* F0, 90..BF, 80..BF, 80..BF */
                       ((lead == 0xF0 && second >= 0x90) ||
                        /* F1..F3, 80..BF, 80..BF, 80..BF */
                        (lead >= 0xF1 && lead <= 0xF3) ||
                        /* F4, 80..8F, 80..BF, 80..BF */
                        (lead == 0xF4 && second <= 0x8F))) {
                width = 4;
            }
        }

        if (width == 0)
            return consumed + 1;

        data += width;
        len -= width;
        consumed += width;
    }

    return 0;
}

@ -0,0 +1,157 @@
/*
* Process 2x16 bytes in each iteration.
* Comments removed for brevity. See range-neon.c for details.
*/
#if defined(__aarch64__) && defined(__ARM_NEON)
#include <stdio.h>
#include <stdint.h>
#include <arm_neon.h>
int utf8_naive(const unsigned char *data, int len);
/*
 * Indexed by the high nibble of an input byte.  Nibbles 0x0..0xB map to 0,
 * 0xC and 0xD to 1, 0xE to 2 and 0xF to 3.
 */
static const uint8_t _first_len_tbl[] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3,
};
/*
 * Indexed by the high nibble of an input byte.  Nibbles 0xC..0xF map to 8,
 * which selects the C2..F4 entry of the min/max tables below.
 */
static const uint8_t _first_range_tbl[] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8,
};
/*
 * Lower bound of each range, indexed by range number.  A byte below the
 * bound of its range is flagged as an error.  Entries 9..15 are 0xFF.
 */
static const uint8_t _range_min_tbl[] = {
    0x00, 0x80, 0x80, 0x80, 0xA0, 0x80, 0x90, 0x80,
    0xC2, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
};
/*
 * Upper bound of each range, indexed like _range_min_tbl.  A byte above the
 * bound of its range is flagged as an error.  Entries 9..15 are 0x00, so
 * together with the 0xFF minimum no byte can pass either check.
 */
static const uint8_t _range_max_tbl[] = {
    0x7F, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0x8F,
    0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
};
/*
 * Range adjustment looked up with (previous byte - 0xE0).  The table is
 * stored interleaved because it is loaded with vld2q_u8: even elements form
 * lookup indices 0..15 and odd elements form indices 16..31.  After that
 * load the non-zero entries are index 0 (E0) -> 2, index 13 (ED) -> 3,
 * index 16 (F0) -> 3 and index 20 (F4) -> 4.
 */
static const uint8_t _range_adjust_tbl[] = {
    2, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0,
};
/*
 * Validate len bytes at data, 32 bytes per loop iteration, handing whatever
 * remains to utf8_naive().
 *
 * Returns 0 on success.  On error the result is non-zero: -1 when the
 * vectorized loop found the error, otherwise the value returned by
 * utf8_naive() for the tail it was given.
 */
int utf8_range2(const unsigned char *data, int len)
{
    if (len >= 32) {
        /* Last block of the previous iteration; all zero before the first. */
        uint8x16_t prev_input = vdupq_n_u8(0);
        uint8x16_t prev_first_len = vdupq_n_u8(0);

        const uint8x16_t first_len_tbl = vld1q_u8(_first_len_tbl);
        const uint8x16_t first_range_tbl = vld1q_u8(_first_range_tbl);
        const uint8x16_t range_min_tbl = vld1q_u8(_range_min_tbl);
        const uint8x16_t range_max_tbl = vld1q_u8(_range_max_tbl);
        /* De-interleaving load: val[0] holds the even table elements,
         * val[1] the odd ones. */
        const uint8x16x2_t range_adjust_tbl = vld2q_u8(_range_adjust_tbl);

        const uint8x16_t const_1 = vdupq_n_u8(1);
        const uint8x16_t const_2 = vdupq_n_u8(2);
        const uint8x16_t const_e0 = vdupq_n_u8(0xE0);

        /* Error masks are only OR-ed inside the loop; they are tested once
         * after it. */
        uint8x16_t error1 = vdupq_n_u8(0);
        uint8x16_t error2 = vdupq_n_u8(0);
        uint8x16_t error3 = vdupq_n_u8(0);
        uint8x16_t error4 = vdupq_n_u8(0);

        while (len >= 32) {
            /******************* two blocks interleaved **********************/
#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ < 8)
            /* gcc doesn't support vld1q_u8_x2 until version 8 */
            const uint8x16_t input_a = vld1q_u8(data);
            const uint8x16_t input_b = vld1q_u8(data + 16);
#else
            /* Forces a double load on Clang */
            const uint8x16x2_t input_pair = vld1q_u8_x2(data);
            const uint8x16_t input_a = input_pair.val[0];
            const uint8x16_t input_b = input_pair.val[1];
#endif

            const uint8x16_t high_nibbles_a = vshrq_n_u8(input_a, 4);
            const uint8x16_t high_nibbles_b = vshrq_n_u8(input_b, 4);

            const uint8x16_t first_len_a =
                vqtbl1q_u8(first_len_tbl, high_nibbles_a);
            const uint8x16_t first_len_b =
                vqtbl1q_u8(first_len_tbl, high_nibbles_b);

            uint8x16_t range_a = vqtbl1q_u8(first_range_tbl, high_nibbles_a);
            uint8x16_t range_b = vqtbl1q_u8(first_range_tbl, high_nibbles_b);

            /* OR in first_len of the byte one position earlier; block b takes
             * its carried-in bytes from block a. */
            range_a =
                vorrq_u8(range_a, vextq_u8(prev_first_len, first_len_a, 15));
            range_b =
                vorrq_u8(range_b, vextq_u8(first_len_a, first_len_b, 15));

            /* first_len of the byte two positions earlier, minus 1
             * (saturating), then three positions earlier, minus 2. */
            uint8x16_t tmp1_a, tmp2_a, tmp1_b, tmp2_b;
            tmp1_a = vextq_u8(prev_first_len, first_len_a, 14);
            tmp1_a = vqsubq_u8(tmp1_a, const_1);
            range_a = vorrq_u8(range_a, tmp1_a);

            tmp1_b = vextq_u8(first_len_a, first_len_b, 14);
            tmp1_b = vqsubq_u8(tmp1_b, const_1);
            range_b = vorrq_u8(range_b, tmp1_b);

            tmp2_a = vextq_u8(prev_first_len, first_len_a, 13);
            tmp2_a = vqsubq_u8(tmp2_a, const_2);
            range_a = vorrq_u8(range_a, tmp2_a);

            tmp2_b = vextq_u8(first_len_a, first_len_b, 13);
            tmp2_b = vqsubq_u8(tmp2_b, const_2);
            range_b = vorrq_u8(range_b, tmp2_b);

            /* Add the adjustment selected by (previous byte - 0xE0). */
            uint8x16_t shift1_a = vextq_u8(prev_input, input_a, 15);
            uint8x16_t pos_a = vsubq_u8(shift1_a, const_e0);
            range_a = vaddq_u8(range_a, vqtbl2q_u8(range_adjust_tbl, pos_a));

            uint8x16_t shift1_b = vextq_u8(input_a, input_b, 15);
            uint8x16_t pos_b = vsubq_u8(shift1_b, const_e0);
            range_b = vaddq_u8(range_b, vqtbl2q_u8(range_adjust_tbl, pos_b));

            /* Fetch the bounds for each byte's range and record every byte
             * that falls outside them. */
            uint8x16_t minv_a = vqtbl1q_u8(range_min_tbl, range_a);
            uint8x16_t maxv_a = vqtbl1q_u8(range_max_tbl, range_a);

            uint8x16_t minv_b = vqtbl1q_u8(range_min_tbl, range_b);
            uint8x16_t maxv_b = vqtbl1q_u8(range_max_tbl, range_b);

            error1 = vorrq_u8(error1, vcltq_u8(input_a, minv_a));
            error2 = vorrq_u8(error2, vcgtq_u8(input_a, maxv_a));

            error3 = vorrq_u8(error3, vcltq_u8(input_b, minv_b));
            error4 = vorrq_u8(error4, vcgtq_u8(input_b, maxv_b));

            /************************ next iteration *************************/
            prev_input = input_b;
            prev_first_len = first_len_b;

            data += 32;
            len -= 32;
        }

        /* Any set lane in any of the four masks means invalid input. */
        error1 = vorrq_u8(error1, error2);
        error1 = vorrq_u8(error1, error3);
        error1 = vorrq_u8(error1, error4);
        if (vmaxvq_u8(error1))
            return -1;

        /* Store the last 32-bit lane of the final block (its last four
         * bytes) so they can be inspected as signed bytes. */
        uint32_t token4;
        vst1q_lane_u32(&token4, vreinterpretq_u32_u8(prev_input), 3);

        /* Step back to the nearest of token[3], token[2], token[1] that is
         * not a continuation byte (80..BF), so that utf8_naive() re-checks
         * from that byte onwards. */
        const int8_t *token = (const int8_t *)&token4;
        int lookahead = 0;
        if (token[3] > (int8_t)0xBF)
            lookahead = 1;
        else if (token[2] > (int8_t)0xBF)
            lookahead = 2;
        else if (token[1] > (int8_t)0xBF)
            lookahead = 3;

        data -= lookahead;
        len += lookahead;
    }

    /* Inputs shorter than 32 bytes, and the tail left by the loop above. */
    return utf8_naive(data, len);
}
#endif

@ -0,0 +1,170 @@
/*
* Process 2x16 bytes in each iteration.
* Comments removed for brevity. See range-sse.c for details.
*/
#ifdef __SSE4_1__
#include <stdio.h>
#include <stdint.h>
#include <x86intrin.h>
int utf8_naive(const unsigned char *data, int len);
/*
 * Indexed by the high nibble of a byte. Gives the number of bytes expected
 * to follow it: 0 for 0x0_..0xB_, 1 for 0xC_/0xD_, 2 for 0xE_, 3 for 0xF_.
 */
static const int8_t _first_len_tbl[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3,
};
/* Indexed by the high nibble: base range index, 8 for 0xC_..0xF_, else 0. */
static const int8_t _first_range_tbl[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8,
};
/*
 * Lowest byte accepted for each range index; read together with
 * _range_max_tbl. Index 0 = 00..7F, 1-3 = 80..BF, 4 = A0..BF, 5 = 80..9F,
 * 6 = 90..BF, 7 = 80..8F, 8 = C2..F4. Indices 9-15 pair min 0x7F with
 * max 0x80; under the signed compares used by utf8_range2() no byte can
 * satisfy both bounds, so those indices always report an error.
 */
static const int8_t _range_min_tbl[] = {
0x00, 0x80, 0x80, 0x80, 0xA0, 0x80, 0x90, 0x80,
0xC2, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
};
/* Highest byte accepted for each range index (see _range_min_tbl). */
static const int8_t _range_max_tbl[] = {
0x7F, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0x8F,
0xF4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
};
/*
 * Range-index adjustment chosen by the previous byte, for previous bytes
 * 0xDF..0xEE (index = previous byte - 0xDF): +2 after 0xE0, +3 after 0xED.
 */
static const int8_t _df_ee_tbl[] = {
0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0,
};
/*
 * Range-index adjustment chosen by the previous byte, for previous bytes
 * 0xEF..0xFE (index = previous byte - 0xEF): +3 after 0xF0, +4 after 0xF4.
 */
static const int8_t _ef_fe_tbl[] = {
0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
/*
 * Validate `len` bytes at `data` as UTF-8 using SSE4.1, consuming 32 bytes
 * (two 16-byte vectors) per loop iteration. Whatever is left after the loop
 * (plus up to three re-checked bytes) is handed to utf8_naive().
 *
 * Returns 0 on success and -1 when the vector loop flagged a byte outside
 * its allowed range. Otherwise the result is whatever utf8_naive() returns
 * for the tail.
 */
int utf8_range2(const unsigned char *data, int len)
{
/* Inputs shorter than 32 bytes skip the vector path entirely. */
if (len >= 32) {
/*
 * State carried from the previous vector. All-zero start values select
 * table entries of 0, so they add nothing to the first vector's ranges.
 */
__m128i prev_input = _mm_set1_epi8(0);
__m128i prev_first_len = _mm_set1_epi8(0);
/* Load the 16-entry lookup tables into registers once, outside the loop. */
const __m128i first_len_tbl =
_mm_loadu_si128((const __m128i *)_first_len_tbl);
const __m128i first_range_tbl =
_mm_loadu_si128((const __m128i *)_first_range_tbl);
const __m128i range_min_tbl =
_mm_loadu_si128((const __m128i *)_range_min_tbl);
const __m128i range_max_tbl =
_mm_loadu_si128((const __m128i *)_range_max_tbl);
const __m128i df_ee_tbl =
_mm_loadu_si128((const __m128i *)_df_ee_tbl);
const __m128i ef_fe_tbl =
_mm_loadu_si128((const __m128i *)_ef_fe_tbl);
/* Errors are OR-accumulated here and only inspected after the loop. */
__m128i error = _mm_set1_epi8(0);
while (len >= 32) {
/***************************** block 1 ****************************/
const __m128i input_a = _mm_loadu_si128((const __m128i *)data);
/* Per-byte high nibble: 16-bit shift, then mask off the bits that leaked in. */
__m128i high_nibbles =
_mm_and_si128(_mm_srli_epi16(input_a, 4), _mm_set1_epi8(0x0F));
__m128i first_len_a = _mm_shuffle_epi8(first_len_tbl, high_nibbles);
__m128i range_a = _mm_shuffle_epi8(first_range_tbl, high_nibbles);
/* OR in first_len of the byte one position back (carried across vectors). */
range_a = _mm_or_si128(
range_a, _mm_alignr_epi8(first_len_a, prev_first_len, 15));
__m128i tmp;
/* Two positions back: saturating first_len - 1. */
tmp = _mm_alignr_epi8(first_len_a, prev_first_len, 14);
tmp = _mm_subs_epu8(tmp, _mm_set1_epi8(1));
range_a = _mm_or_si128(range_a, tmp);
/* Three positions back: saturating first_len - 2. */
tmp = _mm_alignr_epi8(first_len_a, prev_first_len, 13);
tmp = _mm_subs_epu8(tmp, _mm_set1_epi8(2));
range_a = _mm_or_si128(range_a, tmp);
__m128i shift1, pos, range2;
/*
 * shift1 is the previous byte of each lane. It selects an extra range
 * adjustment: the saturating subtract maps 0xDF..0xEE onto df_ee_tbl
 * indices, and the saturating add maps 0xEF..0xFE onto ef_fe_tbl indices
 * while setting the top bit (shuffle result 0) for everything else.
 */
shift1 = _mm_alignr_epi8(input_a, prev_input, 15);
pos = _mm_sub_epi8(shift1, _mm_set1_epi8(0xEF));
tmp = _mm_subs_epu8(pos, _mm_set1_epi8(0xF0));
range2 = _mm_shuffle_epi8(df_ee_tbl, tmp);
tmp = _mm_adds_epu8(pos, _mm_set1_epi8(0x70));
range2 = _mm_add_epi8(range2, _mm_shuffle_epi8(ef_fe_tbl, tmp));
range_a = _mm_add_epi8(range_a, range2);
/* Look up each byte's allowed [min, max] and flag bytes outside it (signed compare). */
__m128i minv = _mm_shuffle_epi8(range_min_tbl, range_a);
__m128i maxv = _mm_shuffle_epi8(range_max_tbl, range_a);
tmp = _mm_or_si128(
_mm_cmplt_epi8(input_a, minv),
_mm_cmpgt_epi8(input_a, maxv)
);
error = _mm_or_si128(error, tmp);
/***************************** block 2 ****************************/
/* Same steps for the next 16 bytes; block 1 supplies the "previous" state. */
const __m128i input_b = _mm_loadu_si128((const __m128i *)(data+16));
high_nibbles =
_mm_and_si128(_mm_srli_epi16(input_b, 4), _mm_set1_epi8(0x0F));
__m128i first_len_b = _mm_shuffle_epi8(first_len_tbl, high_nibbles);
__m128i range_b = _mm_shuffle_epi8(first_range_tbl, high_nibbles);
range_b = _mm_or_si128(
range_b, _mm_alignr_epi8(first_len_b, first_len_a, 15));
tmp = _mm_alignr_epi8(first_len_b, first_len_a, 14);
tmp = _mm_subs_epu8(tmp, _mm_set1_epi8(1));
range_b = _mm_or_si128(range_b, tmp);
tmp = _mm_alignr_epi8(first_len_b, first_len_a, 13);
tmp = _mm_subs_epu8(tmp, _mm_set1_epi8(2));
range_b = _mm_or_si128(range_b, tmp);
shift1 = _mm_alignr_epi8(input_b, input_a, 15);
pos = _mm_sub_epi8(shift1, _mm_set1_epi8(0xEF));
tmp = _mm_subs_epu8(pos, _mm_set1_epi8(0xF0));
range2 = _mm_shuffle_epi8(df_ee_tbl, tmp);
tmp = _mm_adds_epu8(pos, _mm_set1_epi8(0x70));
range2 = _mm_add_epi8(range2, _mm_shuffle_epi8(ef_fe_tbl, tmp));
range_b = _mm_add_epi8(range_b, range2);
minv = _mm_shuffle_epi8(range_min_tbl, range_b);
maxv = _mm_shuffle_epi8(range_max_tbl, range_b);
tmp = _mm_or_si128(
_mm_cmplt_epi8(input_b, minv),
_mm_cmpgt_epi8(input_b, maxv)
);
error = _mm_or_si128(error, tmp);
/************************ next iteration **************************/
prev_input = input_b;
prev_first_len = first_len_b;
data += 32;
len -= 32;
}
/* Any bit set in the accumulator means some byte was out of range. */
if (!_mm_testz_si128(error, error))
return -1;
/*
 * token views the last four bytes of the final vector. (int8_t)0xBF is
 * -65, so "> 0xBF" as a signed compare is true for every byte outside
 * 0x80..0xBF. Back up to the closest such byte among the last three so
 * utf8_naive() re-checks a sequence that may cross the 32-byte boundary.
 */
int32_t token4 = _mm_extract_epi32(prev_input, 3);
const int8_t *token = (const int8_t *)&token4;
int lookahead = 0;
if (token[3] > (int8_t)0xBF)
lookahead = 1;
else if (token[2] > (int8_t)0xBF)
lookahead = 2;
else if (token[1] > (int8_t)0xBF)
lookahead = 3;
data -= lookahead;
len += lookahead;
}
/* Scalar check of the tail (or of the whole input when len < 32). */
return utf8_naive(data, len);
}
#endif

@ -1,395 +0,0 @@
/*
* http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
*
* Table 3-7. Well-Formed UTF-8 Byte Sequences
*
* +--------------------+------------+-------------+------------+-------------+
* | Code Points | First Byte | Second Byte | Third Byte | Fourth Byte |
* +--------------------+------------+-------------+------------+-------------+
* | U+0000..U+007F | 00..7F | | | |
* +--------------------+------------+-------------+------------+-------------+
* | U+0080..U+07FF | C2..DF | 80..BF | | |
* +--------------------+------------+-------------+------------+-------------+
* | U+0800..U+0FFF | E0 | A0..BF | 80..BF | |
* +--------------------+------------+-------------+------------+-------------+
* | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | |
* +--------------------+------------+-------------+------------+-------------+
* | U+D000..U+D7FF | ED | 80..9F | 80..BF | |
* +--------------------+------------+-------------+------------+-------------+
* | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | |
* +--------------------+------------+-------------+------------+-------------+
* | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF |
* +--------------------+------------+-------------+------------+-------------+
* | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF |
* +--------------------+------------+-------------+------------+-------------+
* | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF |
* +--------------------+------------+-------------+------------+-------------+
*/
/*
 * Scalar UTF-8 validator implementing Table 3-7 above.
 *
 * data: bytes to validate.
 * len:  number of bytes at `data`; zero or negative is treated as empty.
 *
 * Returns 0 when the whole buffer is well-formed UTF-8. Otherwise returns the
 * 1-based index of the first byte of the first sequence that is ill-formed or
 * truncated by the end of the buffer.
 */
int utf8_naive(const unsigned char* data, int len) {
  int err_pos = 1;

  /*
   * Loop on len > 0 rather than len != 0: a negative length would otherwise
   * never reach zero and the loop would read past the buffer.
   */
  while (len > 0) {
    int bytes;
    const unsigned char byte1 = data[0];

    /*
     * Continuation bytes 0x80..0xBF are exactly the values that, viewed as
     * signed char, are <= (signed char)0xBF; that is what the casts below test.
     */

    /* 00..7F */
    if (byte1 <= 0x7F) {
      bytes = 1;
    /* C2..DF, 80..BF */
    } else if (len >= 2 && byte1 >= 0xC2 && byte1 <= 0xDF &&
               (signed char)data[1] <= (signed char)0xBF) {
      bytes = 2;
    } else if (len >= 3) {
      const unsigned char byte2 = data[1];

      /* Is byte2, byte3 between 0x80 ~ 0xBF */
      const int byte2_ok = (signed char)byte2 <= (signed char)0xBF;
      const int byte3_ok = (signed char)data[2] <= (signed char)0xBF;

      if (byte2_ok && byte3_ok &&
          /* E0, A0..BF, 80..BF */
          ((byte1 == 0xE0 && byte2 >= 0xA0) ||
           /* E1..EC, 80..BF, 80..BF */
           (byte1 >= 0xE1 && byte1 <= 0xEC) ||
           /* ED, 80..9F, 80..BF */
           (byte1 == 0xED && byte2 <= 0x9F) ||
           /* EE..EF, 80..BF, 80..BF */
           (byte1 >= 0xEE && byte1 <= 0xEF))) {
        bytes = 3;
      } else if (len >= 4) {
        /* Is byte4 between 0x80 ~ 0xBF */
        const int byte4_ok = (signed char)data[3] <= (signed char)0xBF;

        if (byte2_ok && byte3_ok && byte4_ok &&
            /* F0, 90..BF, 80..BF, 80..BF */
            ((byte1 == 0xF0 && byte2 >= 0x90) ||
             /* F1..F3, 80..BF, 80..BF, 80..BF */
             (byte1 >= 0xF1 && byte1 <= 0xF3) ||
             /* F4, 80..8F, 80..BF, 80..BF */
             (byte1 == 0xF4 && byte2 <= 0x8F))) {
          bytes = 4;
        } else {
          return err_pos;
        }
      } else {
        return err_pos;
      }
    } else {
      return err_pos;
    }

    /* Advance past the sequence that was just accepted. */
    len -= bytes;
    err_pos += bytes;
    data += bytes;
  }
  return 0;
}
#ifdef __SSE4_1__
#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>
int utf8_naive(const unsigned char* data, int len);
/*
 * Indexed by the high nibble of a byte. Gives the number of bytes expected
 * to follow it: 0 for 0x0_..0xB_, 1 for 0xC_/0xD_, 2 for 0xE_, 3 for 0xF_.
 */
static const int8_t _first_len_tbl[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3,
};
/* Indexed by the high nibble: base range index, 8 for 0xC_..0xF_, else 0. */
static const int8_t _first_range_tbl[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8,
};
/*
 * Lowest byte accepted for each range index; read together with
 * _range_max_tbl. Index 0 = 00..7F, 1-3 = 80..BF, 4 = A0..BF, 5 = 80..9F,
 * 6 = 90..BF, 7 = 80..8F, 8 = C2..F4. Indices 9-15 pair min 0x7F with
 * max 0x80; under the signed compares used by utf8_range2() no byte can
 * satisfy both bounds, so those indices always report an error.
 */
static const int8_t _range_min_tbl[] = {
0x00, 0x80, 0x80, 0x80, 0xA0, 0x80, 0x90, 0x80,
0xC2, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
};
/* Highest byte accepted for each range index (see _range_min_tbl). */
static const int8_t _range_max_tbl[] = {
0x7F, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0x8F,
0xF4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
};
/*
 * Range-index adjustment chosen by the previous byte, for previous bytes
 * 0xDF..0xEE (index = previous byte - 0xDF): +2 after 0xE0, +3 after 0xED.
 */
static const int8_t _df_ee_tbl[] = {
0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0,
};
/*
 * Range-index adjustment chosen by the previous byte, for previous bytes
 * 0xEF..0xFE (index = previous byte - 0xEF): +3 after 0xF0, +4 after 0xF4.
 */
static const int8_t _ef_fe_tbl[] = {
0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
/*
 * Validate `len` bytes at `data` as UTF-8 using SSE4.1, consuming 32 bytes
 * (two 16-byte vectors) per loop iteration. Whatever is left after the loop
 * (plus up to three re-checked bytes) is handed to utf8_naive().
 *
 * Returns 0 on success and -1 when the vector loop flagged a byte outside
 * its allowed range. Otherwise returns utf8_naive()'s result for the tail,
 * which on error is a positive 1-based index counted from the start of that
 * tail, not from the start of `data`.
 */
int utf8_range2(const unsigned char* data, int len) {
/* Inputs shorter than 32 bytes skip the vector path entirely. */
if (len >= 32) {
/*
 * State carried from the previous vector. All-zero start values select
 * table entries of 0, so they add nothing to the first vector's ranges.
 */
__m128i prev_input = _mm_set1_epi8(0);
__m128i prev_first_len = _mm_set1_epi8(0);
/* Load the 16-entry lookup tables into registers once, outside the loop. */
const __m128i first_len_tbl =
_mm_loadu_si128((const __m128i*)_first_len_tbl);
const __m128i first_range_tbl =
_mm_loadu_si128((const __m128i*)_first_range_tbl);
const __m128i range_min_tbl =
_mm_loadu_si128((const __m128i*)_range_min_tbl);
const __m128i range_max_tbl =
_mm_loadu_si128((const __m128i*)_range_max_tbl);
const __m128i df_ee_tbl = _mm_loadu_si128((const __m128i*)_df_ee_tbl);
const __m128i ef_fe_tbl = _mm_loadu_si128((const __m128i*)_ef_fe_tbl);
/* Errors are OR-accumulated here and only inspected after the loop. */
__m128i error = _mm_set1_epi8(0);
while (len >= 32) {
/***************************** block 1 ****************************/
const __m128i input_a = _mm_loadu_si128((const __m128i*)data);
/* Per-byte high nibble: 16-bit shift, then mask off the bits that leaked in. */
__m128i high_nibbles =
_mm_and_si128(_mm_srli_epi16(input_a, 4), _mm_set1_epi8(0x0F));
__m128i first_len_a = _mm_shuffle_epi8(first_len_tbl, high_nibbles);
__m128i range_a = _mm_shuffle_epi8(first_range_tbl, high_nibbles);
/* OR in first_len of the byte one position back (carried across vectors). */
range_a = _mm_or_si128(range_a,
_mm_alignr_epi8(first_len_a, prev_first_len, 15));
__m128i tmp;
/* Two positions back: saturating first_len - 1. */
tmp = _mm_alignr_epi8(first_len_a, prev_first_len, 14);
tmp = _mm_subs_epu8(tmp, _mm_set1_epi8(1));
range_a = _mm_or_si128(range_a, tmp);
/* Three positions back: saturating first_len - 2. */
tmp = _mm_alignr_epi8(first_len_a, prev_first_len, 13);
tmp = _mm_subs_epu8(tmp, _mm_set1_epi8(2));
range_a = _mm_or_si128(range_a, tmp);
__m128i shift1, pos, range2;
/*
 * shift1 is the previous byte of each lane. It selects an extra range
 * adjustment: the saturating subtract maps 0xDF..0xEE onto df_ee_tbl
 * indices, and the saturating add maps 0xEF..0xFE onto ef_fe_tbl indices
 * while setting the top bit (shuffle result 0) for everything else.
 */
shift1 = _mm_alignr_epi8(input_a, prev_input, 15);
pos = _mm_sub_epi8(shift1, _mm_set1_epi8(0xEF));
tmp = _mm_subs_epu8(pos, _mm_set1_epi8(0xF0));
range2 = _mm_shuffle_epi8(df_ee_tbl, tmp);
tmp = _mm_adds_epu8(pos, _mm_set1_epi8(0x70));
range2 = _mm_add_epi8(range2, _mm_shuffle_epi8(ef_fe_tbl, tmp));
range_a = _mm_add_epi8(range_a, range2);
/* Look up each byte's allowed [min, max] and flag bytes outside it (signed compare). */
__m128i minv = _mm_shuffle_epi8(range_min_tbl, range_a);
__m128i maxv = _mm_shuffle_epi8(range_max_tbl, range_a);
tmp = _mm_or_si128(_mm_cmplt_epi8(input_a, minv),
_mm_cmpgt_epi8(input_a, maxv));
error = _mm_or_si128(error, tmp);
/***************************** block 2 ****************************/
/* Same steps for the next 16 bytes; block 1 supplies the "previous" state. */
const __m128i input_b = _mm_loadu_si128((const __m128i*)(data + 16));
high_nibbles =
_mm_and_si128(_mm_srli_epi16(input_b, 4), _mm_set1_epi8(0x0F));
__m128i first_len_b = _mm_shuffle_epi8(first_len_tbl, high_nibbles);
__m128i range_b = _mm_shuffle_epi8(first_range_tbl, high_nibbles);
range_b =
_mm_or_si128(range_b, _mm_alignr_epi8(first_len_b, first_len_a, 15));
tmp = _mm_alignr_epi8(first_len_b, first_len_a, 14);
tmp = _mm_subs_epu8(tmp, _mm_set1_epi8(1));
range_b = _mm_or_si128(range_b, tmp);
tmp = _mm_alignr_epi8(first_len_b, first_len_a, 13);
tmp = _mm_subs_epu8(tmp, _mm_set1_epi8(2));
range_b = _mm_or_si128(range_b, tmp);
shift1 = _mm_alignr_epi8(input_b, input_a, 15);
pos = _mm_sub_epi8(shift1, _mm_set1_epi8(0xEF));
tmp = _mm_subs_epu8(pos, _mm_set1_epi8(0xF0));
range2 = _mm_shuffle_epi8(df_ee_tbl, tmp);
tmp = _mm_adds_epu8(pos, _mm_set1_epi8(0x70));
range2 = _mm_add_epi8(range2, _mm_shuffle_epi8(ef_fe_tbl, tmp));
range_b = _mm_add_epi8(range_b, range2);
minv = _mm_shuffle_epi8(range_min_tbl, range_b);
maxv = _mm_shuffle_epi8(range_max_tbl, range_b);
tmp = _mm_or_si128(_mm_cmplt_epi8(input_b, minv),
_mm_cmpgt_epi8(input_b, maxv));
error = _mm_or_si128(error, tmp);
/************************ next iteration **************************/
prev_input = input_b;
prev_first_len = first_len_b;
data += 32;
len -= 32;
}
/* Any bit set in the accumulator means some byte was out of range. */
if (!_mm_testz_si128(error, error)) return -1;
/*
 * token views the last four bytes of the final vector. (int8_t)0xBF is
 * -65, so "> 0xBF" as a signed compare is true for every byte outside
 * 0x80..0xBF. Back up to the closest such byte among the last three so
 * utf8_naive() re-checks a sequence that may cross the 32-byte boundary.
 */
int32_t token4 = _mm_extract_epi32(prev_input, 3);
const int8_t* token = (const int8_t*)&token4;
int lookahead = 0;
if (token[3] > (int8_t)0xBF)
lookahead = 1;
else if (token[2] > (int8_t)0xBF)
lookahead = 2;
else if (token[1] > (int8_t)0xBF)
lookahead = 3;
data -= lookahead;
len += lookahead;
}
/* Scalar check of the tail (or of the whole input when len < 32). */
return utf8_naive(data, len);
}
#endif
#ifdef __ARM_NEON
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>
int utf8_naive(const unsigned char* data, int len);
/*
 * Indexed by the high nibble of a byte. Gives the number of bytes expected
 * to follow it: 0 for 0x0_..0xB_, 1 for 0xC_/0xD_, 2 for 0xE_, 3 for 0xF_.
 */
static const uint8_t _first_len_tbl[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3,
};
/* Indexed by the high nibble: base range index, 8 for 0xC_..0xF_, else 0. */
static const uint8_t _first_range_tbl[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8,
};
/*
 * Lowest byte accepted for each range index; read together with
 * _range_max_tbl. Index 0 = 00..7F, 1-3 = 80..BF, 4 = A0..BF, 5 = 80..9F,
 * 6 = 90..BF, 7 = 80..8F, 8 = C2..F4. Indices 9-15 pair min 0xFF with
 * max 0x00; under the unsigned compares used by utf8_range2() no byte can
 * satisfy both bounds, so those indices always report an error.
 */
static const uint8_t _range_min_tbl[] = {
0x00, 0x80, 0x80, 0x80, 0xA0, 0x80, 0x90, 0x80,
0xC2, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
};
/* Highest byte accepted for each range index (see _range_min_tbl). */
static const uint8_t _range_max_tbl[] = {
0x7F, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0x8F,
0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
};
/*
 * Range-index adjustment chosen by (previous byte - 0xE0). The array is
 * stored interleaved because utf8_range2() loads it with vld2q_u8, which
 * de-interleaves: even elements form lookup entries 0..15 and odd elements
 * form entries 16..31. The resulting non-zero entries are 0 (E0) -> 2,
 * 13 (ED) -> 3, 16 (F0) -> 3 and 20 (F4) -> 4.
 */
static const uint8_t _range_adjust_tbl[] = {
2, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0,
};
/*
 * Validate `len` bytes at `data` as UTF-8 using NEON, consuming 32 bytes
 * (two 16-byte vectors, processed interleaved) per loop iteration. Whatever
 * is left after the loop (plus up to three re-checked bytes) is handed to
 * utf8_naive().
 *
 * Returns 0 on success and -1 when the vector loop flagged a byte outside
 * its allowed range. Otherwise returns utf8_naive()'s result for the tail,
 * which on error is a positive 1-based index counted from the start of that
 * tail, not from the start of `data`.
 */
int utf8_range2(const unsigned char* data, int len) {
/* Inputs shorter than 32 bytes skip the vector path entirely. */
if (len >= 32) {
/*
 * State carried from the previous vector. All-zero start values select
 * table entries of 0, so they add nothing to the first vector's ranges.
 */
uint8x16_t prev_input = vdupq_n_u8(0);
uint8x16_t prev_first_len = vdupq_n_u8(0);
/* Load the lookup tables into registers once, outside the loop. */
const uint8x16_t first_len_tbl = vld1q_u8(_first_len_tbl);
const uint8x16_t first_range_tbl = vld1q_u8(_first_range_tbl);
const uint8x16_t range_min_tbl = vld1q_u8(_range_min_tbl);
const uint8x16_t range_max_tbl = vld1q_u8(_range_max_tbl);
/* vld2q_u8 de-interleaves the 32-byte array into two 16-entry halves. */
const uint8x16x2_t range_adjust_tbl = vld2q_u8(_range_adjust_tbl);
const uint8x16_t const_1 = vdupq_n_u8(1);
const uint8x16_t const_2 = vdupq_n_u8(2);
const uint8x16_t const_e0 = vdupq_n_u8(0xE0);
/* Four independent error accumulators, merged and inspected after the loop. */
uint8x16_t error1 = vdupq_n_u8(0);
uint8x16_t error2 = vdupq_n_u8(0);
uint8x16_t error3 = vdupq_n_u8(0);
uint8x16_t error4 = vdupq_n_u8(0);
while (len >= 32) {
/******************* two blocks interleaved **********************/
#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ < 8)
/* gcc doesn't support vld1q_u8_x2 until version 8 */
const uint8x16_t input_a = vld1q_u8(data);
const uint8x16_t input_b = vld1q_u8(data + 16);
#else
/* Forces a double load on Clang */
const uint8x16x2_t input_pair = vld1q_u8_x2(data);
const uint8x16_t input_a = input_pair.val[0];
const uint8x16_t input_b = input_pair.val[1];
#endif
const uint8x16_t high_nibbles_a = vshrq_n_u8(input_a, 4);
const uint8x16_t high_nibbles_b = vshrq_n_u8(input_b, 4);
const uint8x16_t first_len_a = vqtbl1q_u8(first_len_tbl, high_nibbles_a);
const uint8x16_t first_len_b = vqtbl1q_u8(first_len_tbl, high_nibbles_b);
uint8x16_t range_a = vqtbl1q_u8(first_range_tbl, high_nibbles_a);
uint8x16_t range_b = vqtbl1q_u8(first_range_tbl, high_nibbles_b);
/* OR in first_len of the byte one position back (vext shifts across vectors). */
range_a = vorrq_u8(range_a, vextq_u8(prev_first_len, first_len_a, 15));
range_b = vorrq_u8(range_b, vextq_u8(first_len_a, first_len_b, 15));
uint8x16_t tmp1_a, tmp2_a, tmp1_b, tmp2_b;
/* Two positions back: saturating first_len - 1. */
tmp1_a = vextq_u8(prev_first_len, first_len_a, 14);
tmp1_a = vqsubq_u8(tmp1_a, const_1);
range_a = vorrq_u8(range_a, tmp1_a);
tmp1_b = vextq_u8(first_len_a, first_len_b, 14);
tmp1_b = vqsubq_u8(tmp1_b, const_1);
range_b = vorrq_u8(range_b, tmp1_b);
/* Three positions back: saturating first_len - 2. */
tmp2_a = vextq_u8(prev_first_len, first_len_a, 13);
tmp2_a = vqsubq_u8(tmp2_a, const_2);
range_a = vorrq_u8(range_a, tmp2_a);
tmp2_b = vextq_u8(first_len_a, first_len_b, 13);
tmp2_b = vqsubq_u8(tmp2_b, const_2);
range_b = vorrq_u8(range_b, tmp2_b);
/*
 * shift1 is the previous byte of each lane; (previous byte - 0xE0) indexes
 * the 32-entry adjustment table. Previous bytes outside 0xE0..0xFF wrap to
 * an index >= 32, for which the table lookup yields 0.
 */
uint8x16_t shift1_a = vextq_u8(prev_input, input_a, 15);
uint8x16_t pos_a = vsubq_u8(shift1_a, const_e0);
range_a = vaddq_u8(range_a, vqtbl2q_u8(range_adjust_tbl, pos_a));
uint8x16_t shift1_b = vextq_u8(input_a, input_b, 15);
uint8x16_t pos_b = vsubq_u8(shift1_b, const_e0);
range_b = vaddq_u8(range_b, vqtbl2q_u8(range_adjust_tbl, pos_b));
/* Look up each byte's allowed [min, max] and flag bytes outside it (unsigned compare). */
uint8x16_t minv_a = vqtbl1q_u8(range_min_tbl, range_a);
uint8x16_t maxv_a = vqtbl1q_u8(range_max_tbl, range_a);
uint8x16_t minv_b = vqtbl1q_u8(range_min_tbl, range_b);
uint8x16_t maxv_b = vqtbl1q_u8(range_max_tbl, range_b);
error1 = vorrq_u8(error1, vcltq_u8(input_a, minv_a));
error2 = vorrq_u8(error2, vcgtq_u8(input_a, maxv_a));
error3 = vorrq_u8(error3, vcltq_u8(input_b, minv_b));
error4 = vorrq_u8(error4, vcgtq_u8(input_b, maxv_b));
/************************ next iteration *************************/
prev_input = input_b;
prev_first_len = first_len_b;
data += 32;
len -= 32;
}
/* Merge the accumulators; a non-zero maximum lane means some byte failed. */
error1 = vorrq_u8(error1, error2);
error1 = vorrq_u8(error1, error3);
error1 = vorrq_u8(error1, error4);
if (vmaxvq_u8(error1)) return -1;
/*
 * token views the last four bytes of the final vector. (int8_t)0xBF is
 * -65, so "> 0xBF" as a signed compare is true for every byte outside
 * 0x80..0xBF. Back up to the closest such byte among the last three so
 * utf8_naive() re-checks a sequence that may cross the 32-byte boundary.
 */
uint32_t token4;
vst1q_lane_u32(&token4, vreinterpretq_u32_u8(prev_input), 3);
const int8_t* token = (const int8_t*)&token4;
int lookahead = 0;
if (token[3] > (int8_t)0xBF)
lookahead = 1;
else if (token[2] > (int8_t)0xBF)
lookahead = 2;
else if (token[1] > (int8_t)0xBF)
lookahead = 3;
data -= lookahead;
len += lookahead;
}
/* Scalar check of the tail (or of the whole input when len < 32). */
return utf8_naive(data, len);
}
#endif

@ -1,5 +1,5 @@
#if defined(__ARM_NEON) || defined(__SSE4_1__) #if (defined(__ARM_NEON) && defined(__aarch64__)) || defined(__SSE4_1__)
int utf8_range2(const unsigned char* data, int len); int utf8_range2(const unsigned char* data, int len);
#else #else
int utf8_naive(const unsigned char* data, int len); int utf8_naive(const unsigned char* data, int len);

@ -1,5 +0,0 @@
This directory contains code that interfaces upb with external C/C++
libraries. Right now this is:
* upb/bindings/lua:
a Lua extension that exposes upb to Lua programs via the Lua C API.

@ -28,6 +28,10 @@ load(
"UPB_DEFAULT_COPTS", "UPB_DEFAULT_COPTS",
"UPB_DEFAULT_CPPOPTS", "UPB_DEFAULT_CPPOPTS",
) )
load(
"//upb/bindings/lua:lua_proto_library.bzl",
"lua_proto_library",
)
licenses(["notice"]) licenses(["notice"])
@ -64,3 +68,62 @@ cc_binary(
) )
exports_files(["upb.lua"]) exports_files(["upb.lua"])
# Test binary built from main.c and linked statically against the Lua
# bindings (lupb) and liblua. The Lua test script, the lunit sources, upb.lua
# and the lua_proto_library outputs are supplied as runtime data.
cc_test(
name = "test_lua",
srcs = ["main.c"],
copts = UPB_DEFAULT_COPTS,
data = [
"test_upb.lua",
":descriptor_proto_lua",
":empty_proto_lua",
":test_messages_proto2_proto_lua",
":test_messages_proto3_proto_lua",
":test_proto_lua",
"//:third_party/lunit/console.lua",
"//:third_party/lunit/lunit.lua",
"//upb/bindings/lua:upb.lua",
"@com_google_protobuf//:conformance_proto",
"@com_google_protobuf//:descriptor_proto",
],
linkstatic = 1,
deps = [
"//upb/bindings/lua:lupb",
"@lua//:liblua",
],
)
# Test-only proto built from test.proto; depends on the well-known
# timestamp proto.
proto_library(
name = "test_proto",
testonly = 1,
srcs = ["test.proto"],
deps = ["@com_google_protobuf//:timestamp_proto"],
)
# lua_proto_library targets listed in test_lua's data. Presumably each one
# generates the Lua module for its proto dependency -- confirm against
# lua_proto_library.bzl.
lua_proto_library(
name = "test_proto_lua",
testonly = 1,
deps = [":test_proto"],
)
lua_proto_library(
name = "descriptor_proto_lua",
deps = ["@com_google_protobuf//:descriptor_proto"],
)
lua_proto_library(
name = "empty_proto_lua",
deps = ["@com_google_protobuf//:empty_proto"],
)
lua_proto_library(
name = "test_messages_proto3_proto_lua",
testonly = 1,
deps = ["@com_google_protobuf//:test_messages_proto3_proto"],
)
lua_proto_library(
name = "test_messages_proto2_proto_lua",
testonly = 1,
deps = ["@com_google_protobuf//:test_messages_proto2_proto"],
)

@ -0,0 +1,8 @@
# upb Lua bindings
These are some bare-bones upb bindings for Lua.
These bindings exist primarily for experimentation and testing. They are
incomplete and are not really intended for use in any application. This is by
no means a complete or supported protobuf library.

@ -70,7 +70,7 @@ int main(int argc, char** argv) {
signal(SIGINT, sighandler); signal(SIGINT, sighandler);
ret = ret || lua_pcall(L, 1, LUA_MULTRET, 0) || ret = ret || lua_pcall(L, 1, LUA_MULTRET, 0) ||
luaL_dofile(L, "tests/bindings/lua/test_upb.lua"); luaL_dofile(L, "upb/bindings/lua/test_upb.lua");
signal(SIGINT, SIG_DFL); signal(SIGINT, SIG_DFL);
if (ret) { if (ret) {

@ -29,7 +29,7 @@
local upb = require "lupb" local upb = require "lupb"
local lunit = require "lunit" local lunit = require "lunit"
local upb_test = require "tests.bindings.lua.test_pb" local upb_test = require "upb.bindings.lua.test_pb"
local test_messages_proto3 = require "google.protobuf.test_messages_proto3_pb" local test_messages_proto3 = require "google.protobuf.test_messages_proto3_pb"
local test_messages_proto2 = require "google.protobuf.test_messages_proto2_pb" local test_messages_proto2 = require "google.protobuf.test_messages_proto2_pb"
local descriptor = require "google.protobuf.descriptor_pb" local descriptor = require "google.protobuf.descriptor_pb"

@ -0,0 +1 @@
# No conformance tests are failing, thus the failure list is empty.

@ -38,12 +38,12 @@
#include "google/protobuf/timestamp.upb.h" #include "google/protobuf/timestamp.upb.h"
#include "google/protobuf/timestamp.upbdefs.h" #include "google/protobuf/timestamp.upbdefs.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "tests/test_cpp.upb.h"
#include "tests/test_cpp.upbdefs.h"
#include "upb/def.h" #include "upb/def.h"
#include "upb/def.hpp" #include "upb/def.hpp"
#include "upb/json_decode.h" #include "upb/json_decode.h"
#include "upb/json_encode.h" #include "upb/json_encode.h"
#include "upb/test_cpp.upb.h"
#include "upb/test_cpp.upbdefs.h"
#include "upb/upb.h" #include "upb/upb.h"
// Must be last. // Must be last.

@ -32,7 +32,7 @@
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "src/google/protobuf/test_messages_proto3.upb.h" #include "src/google/protobuf/test_messages_proto3.upb.h"
#include "tests/test.upb.h" #include "upb/test.upb.h"
#include "upb/upb.hpp" #include "upb/upb.hpp"
#define MIN(x, y) ((x) < (y) ? (x) : (y)) #define MIN(x, y) ((x) < (y) ? (x) : (y))
Loading…
Cancel
Save