Implementing string interning

pull/288/head
Ayende Rahien 15 years ago
parent e3aff478fd
commit 162b656fdd
  1. 56
      src/ProtocolBuffers/ByteBuffer.cs
  2. 131
      src/ProtocolBuffers/ByteStringStringInterning.cs
  3. 48
      src/ProtocolBuffers/CodedInputStream.cs
  4. 266
      src/ProtocolBuffers/ProtocolBuffers.csproj

@ -0,0 +1,56 @@
using System;
namespace Google.ProtocolBuffers
{
/// <summary>
/// A window (<see cref="Offset"/>, <see cref="Length"/>) over a byte array, together with a
/// cached hash of the bytes inside that window, so the window can be used as a lookup key
/// without first copying the bytes out.
/// </summary>
/// <remarks>
/// The fields are public and mutable, but the hash code is only recomputed by the constructor
/// and by <see cref="ResetHash"/>. After changing <see cref="Buffer"/> (or the bytes it holds),
/// <see cref="Offset"/> or <see cref="Length"/>, call <see cref="ResetHash"/> before the
/// instance is hashed again.
/// </remarks>
public class ByteBuffer
{
    /// <summary>The backing array; only the window described by Offset/Length is considered.</summary>
    public byte[] Buffer;

    /// <summary>Index in <see cref="Buffer"/> of the first byte of the window.</summary>
    public int Offset;

    /// <summary>Number of bytes in the window.</summary>
    public int Length;

    // Cached result of the last ResetHash() call; returned as-is by GetHashCode().
    private int hash;

    /// <summary>
    /// Recomputes the cached hash from the bytes currently inside the window.
    /// </summary>
    public void ResetHash()
    {
        // The multiplication is expected to wrap around; 'unchecked' keeps that true even
        // when the assembly is compiled with overflow checking enabled.
        unchecked
        {
            int computed = 23;
            int end = Offset + Length;
            for (int i = Offset; i < end; i++)
            {
                computed = (computed * 23) ^ Buffer[i];
            }
            hash = computed;
        }
    }

    /// <summary>
    /// Creates a window over <paramref name="buffer"/> and computes its hash immediately.
    /// </summary>
    /// <param name="buffer">Backing array. It is referenced, not copied.</param>
    /// <param name="offset">Index of the first byte of the window.</param>
    /// <param name="length">Number of bytes in the window.</param>
    public ByteBuffer(byte[] buffer, int offset, int length)
    {
        Buffer = buffer;
        Offset = offset;
        Length = length;
        ResetHash();
    }

    /// <summary>
    /// Copies the bytes of the window into a new <see cref="ByteString"/>.
    /// </summary>
    public ByteString ToByteString()
    {
        return ByteString.CopyFrom(Buffer, Offset, Length);
    }

    /// <summary>
    /// Returns the hash cached by the last call to <see cref="ResetHash"/>.
    /// </summary>
    public override int GetHashCode()
    {
        return hash;
    }

    /// <summary>
    /// Two buffers are equal when their windows have the same length and hold the same bytes.
    /// Where each window starts inside its backing array is irrelevant, which keeps equality
    /// consistent with the content-only hash.
    /// </summary>
    public override bool Equals(object obj)
    {
        var other = obj as ByteBuffer;
        if (ReferenceEquals(other, null))
        {
            return false;
        }
        if (ReferenceEquals(other, this))
        {
            return true;
        }
        if (other.Length != Length)
        {
            return false;
        }

        // Walk both windows in parallel, each from its own starting offset.
        for (int i = 0; i < Length; i++)
        {
            if (Buffer[Offset + i] != other.Buffer[other.Offset + i])
            {
                return false;
            }
        }
        return true;
    }
}
}

@ -0,0 +1,131 @@
using System;
using System.Collections.Generic;
using System.Text;
using System.Threading;
namespace Google.ProtocolBuffers
{
/// <summary>
/// Caches the strings decoded from UTF-8 byte sequences, so that a sequence which has been
/// seen before can be turned into a string directly from the input buffer, without having to
/// copy the bytes out or decode them again.
///
/// Note: not thread safe. The backing dictionary is not synchronized, so an instance must not
/// be used from several threads at the same time.
/// </summary>
public class ByteStringStringInterning
{
    /// <summary>
    /// Dictionary key that wraps either an owned <see cref="ByteString"/> (for stored entries)
    /// or a borrowed <see cref="ByteBuffer"/> (for lookups), and compares the two by content.
    /// </summary>
    private sealed class ByteStringOrByteBuffer : IEquatable<ByteStringOrByteBuffer>
    {
        private readonly ByteString str;
        private readonly ByteBuffer buffer;

        // Hash of 'str', computed with ByteBuffer's algorithm so that a ByteString key and a
        // ByteBuffer key over the same bytes always land in the same dictionary bucket,
        // whatever ByteString.GetHashCode() happens to return.
        private readonly int strHash;

        public ByteStringOrByteBuffer(ByteString str)
        {
            this.str = str;
            strHash = new ByteBuffer(str.bytes, 0, str.Length).GetHashCode();
        }

        public ByteStringOrByteBuffer(ByteBuffer buffer)
        {
            this.buffer = buffer;
        }

        /// <summary>
        /// Content comparison across all four combinations of wrapped representations.
        /// </summary>
        public bool Equals(ByteStringOrByteBuffer other)
        {
            if (ReferenceEquals(null, other)) return false;
            if (ReferenceEquals(this, other)) return true;

            if (str != null)
            {
                return other.str != null
                    ? Equals(other.str, str)
                    : StringEqualsToBuffer(str, other.buffer);
            }

            return other.str != null
                ? StringEqualsToBuffer(other.str, buffer)
                : Equals(other.buffer, buffer);
        }

        // Byte-wise comparison of a ByteString against the window of a ByteBuffer.
        private static bool StringEqualsToBuffer(ByteString byteString, ByteBuffer byteBuffer)
        {
            int count = byteString.Length;
            if (count != byteBuffer.Length)
            {
                return false;
            }
            for (int i = 0; i < count; i++)
            {
                if (byteString.bytes[i] != byteBuffer.Buffer[byteBuffer.Offset + i])
                {
                    return false;
                }
            }
            return true;
        }

        public override bool Equals(object obj)
        {
            if (ReferenceEquals(null, obj)) return false;
            if (ReferenceEquals(this, obj)) return true;
            return Equals(obj as ByteStringOrByteBuffer);
        }

        public override int GetHashCode()
        {
            // The buffer's hash is read at call time because ByteBuffer is mutable and caches
            // its own hash; the ByteString's hash was fixed in the constructor.
            return str != null ? strHash : buffer.GetHashCode();
        }
    }

    // Number of entries above which a cleanup pass is triggered.
    private readonly int limit;

    // Monotonic counter used as a "last used" stamp for eviction ordering.
    private int timestamp;

    private readonly IDictionary<ByteStringOrByteBuffer, Data> strings = new Dictionary<ByteStringOrByteBuffer, Data>();

    /// <summary>
    /// Creates an interning pool with the default limit of 65536 entries.
    /// </summary>
    public static ByteStringStringInterning CreateInstance()
    {
        return new ByteStringStringInterning(65536);
    }

    /// <summary>A cached string and the counter value at which it was last requested.</summary>
    [Serializable]
    private class Data
    {
        public string Value;
        public int Timestamp;
    }

    private ByteStringStringInterning(int limit)
    {
        this.limit = limit;
    }

    /// <summary>
    /// Drops every cached string.
    /// </summary>
    public void Clear()
    {
        strings.Clear();
    }

    /// <summary>
    /// Returns the string for the UTF-8 bytes in <paramref name="str"/>, reusing a cached
    /// instance when the same byte sequence has been interned before.
    /// </summary>
    /// <param name="str">Window over the raw bytes. It is only read; on a cache miss its
    /// bytes are copied, so the caller may reuse the underlying array afterwards.</param>
    /// <returns>The decoded string.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
    public string Intern(ByteBuffer str)
    {
        if (str == null)
        {
            throw new ArgumentNullException("str");
        }

        int currentTimestamp = Interlocked.Increment(ref timestamp);

        // Look up using the caller's buffer directly, so a hit costs no byte copy or decode.
        Data val;
        if (strings.TryGetValue(new ByteStringOrByteBuffer(str), out val))
        {
            Interlocked.Exchange(ref val.Timestamp, currentTimestamp);
            return val.Value;
        }

        // Miss: take a private copy of the bytes to serve as the stored key.
        ByteString byteString = str.ToByteString();
        val = new Data { Timestamp = currentTimestamp, Value = byteString.ToStringUtf8() };
        strings.Add(new ByteStringOrByteBuffer(byteString), val);
        DoCleanupIfNeeded();
        return val.Value;
    }

    /// <summary>
    /// Once the pool grows past the limit, evicts the least recently used entries.
    /// </summary>
    private void DoCleanupIfNeeded()
    {
        if (strings.Count <= limit)
            return;

        // to avoid frequent thrashing, we will remove the bottom 10% of the current pool in one go
        // that means that we will hit the limit fairly infrequently
        var entries = new List<KeyValuePair<ByteStringOrByteBuffer, Data>>(strings);

        // CompareTo rather than subtraction: the difference of two ints can overflow once the
        // timestamp counter has wrapped, which would make the comparison inconsistent.
        entries.Sort((x, y) => x.Value.Timestamp.CompareTo(y.Value.Timestamp));

        // Always evict at least one entry (limit / 10 is 0 for small limits), never more
        // than there are.
        int evictCount = Math.Min(entries.Count, Math.Max(1, limit / 10));
        for (int i = 0; i < evictCount; i++)
        {
            strings.Remove(entries[i].Key);
        }
    }
}
}

@ -62,7 +62,10 @@ namespace Google.ProtocolBuffers {
private int bufferSizeAfterLimit = 0;
private int bufferPos = 0;
private readonly Stream input;
private uint lastTag = 0;
private uint lastTag = 0;
private readonly ByteBuffer rawBytesBuffer = new ByteBuffer(new byte[BufferSize], 0, 0);
private readonly ByteStringStringInterning byteStringStringInterning = ByteStringStringInterning.CreateInstance();
internal const int DefaultRecursionLimit = 64;
internal const int DefaultSizeLimit = 64 << 20; // 64MB
@ -237,13 +240,13 @@ namespace Google.ProtocolBuffers {
}
if (size <= bufferSize - bufferPos) {
// Fast path: We already have the bytes in a contiguous buffer, so
// just copy directly from it.
String result = Encoding.UTF8.GetString(buffer, bufferPos, size);
// just copy directly from it.
String result = byteStringStringInterning.Intern(new ByteBuffer(buffer, bufferPos, size));
bufferPos += size;
return result;
}
// Slow path: Build a byte array first then copy it.
return Encoding.UTF8.GetString(ReadRawBytes(size), 0, size);
// Slow path: Build a byte array first then copy it.
return byteStringStringInterning.Intern(ReadRawBytes(size));
}
/// <summary>
@ -302,8 +305,9 @@ namespace Google.ProtocolBuffers {
bufferPos += size;
return result;
} else {
// Slow path: Build a byte array first then copy it.
return ByteString.CopyFrom(ReadRawBytes(size));
// Slow path: Build a byte array first then copy it.
ByteBuffer rawBytes = ReadRawBytes(size);
return ByteString.CopyFrom(rawBytes.Buffer, rawBytes.Offset, rawBytes.Length);
}
}
@ -763,7 +767,7 @@ namespace Google.ProtocolBuffers {
/// <exception cref="InvalidProtocolBufferException">
/// the end of the stream or the current limit was reached
/// </exception>
public byte[] ReadRawBytes(int size) {
public ByteBuffer ReadRawBytes(int size) {
if (size < 0) {
throw InvalidProtocolBufferException.NegativeSize();
}
@ -776,19 +780,19 @@ namespace Google.ProtocolBuffers {
}
if (size <= bufferSize - bufferPos) {
// We have all the bytes we need already.
byte[] bytes = new byte[size];
Array.Copy(buffer, bufferPos, bytes, 0, size);
bufferPos += size;
return bytes;
// We have all the bytes we need already.
var result = new ByteBuffer(buffer, bufferPos, size);
bufferPos += size;
return result;
} else if (size < BufferSize) {
// Reading more bytes than are in the buffer, but not an excessive number
// of bytes. We can safely allocate the resulting array ahead of time.
// First copy what we have.
byte[] bytes = new byte[size];
// First copy what we have.
rawBytesBuffer.Length = size;
rawBytesBuffer.Offset = 0;
int pos = bufferSize - bufferPos;
Array.Copy(buffer, bufferPos, bytes, 0, pos);
Array.Copy(buffer, bufferPos, rawBytesBuffer.Buffer, 0, pos);
bufferPos = bufferSize;
// We want to use RefillBuffer() and then copy from the buffer into our
@ -797,16 +801,16 @@ namespace Google.ProtocolBuffers {
RefillBuffer(true);
while (size - pos > bufferSize) {
Array.Copy(buffer, 0, bytes, pos, bufferSize);
Array.Copy(buffer, 0, rawBytesBuffer.Buffer, pos, bufferSize);
pos += bufferSize;
bufferPos = bufferSize;
RefillBuffer(true);
}
Array.Copy(buffer, 0, bytes, pos, size - pos);
bufferPos = size - pos;
return bytes;
Array.Copy(buffer, 0, rawBytesBuffer.Buffer, pos, size - pos);
bufferPos = size - pos;
rawBytesBuffer.ResetHash();
return rawBytesBuffer;
} else {
// The size is very large. For security reasons, we can't allocate the
// entire byte array yet. The size comes directly from the input, so a
@ -859,7 +863,7 @@ namespace Google.ProtocolBuffers {
}
// Done.
return bytes;
return new ByteBuffer(buffer, 0, size);
}
}

@ -1,139 +1,141 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="3.5" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
<ProductVersion>9.0.30729</ProductVersion>
<SchemaVersion>2.0</SchemaVersion>
<ProjectGuid>{6908BDCE-D925-43F3-94AC-A531E6DF2591}</ProjectGuid>
<OutputType>Library</OutputType>
<AppDesignerFolder>Properties</AppDesignerFolder>
<RootNamespace>Google.ProtocolBuffers</RootNamespace>
<AssemblyName>Google.ProtocolBuffers</AssemblyName>
<TargetFrameworkVersion>v2.0</TargetFrameworkVersion>
<FileAlignment>512</FileAlignment>
<SignAssembly>true</SignAssembly>
<AssemblyOriginatorKeyFile>Properties\Google.ProtocolBuffers.snk</AssemblyOriginatorKeyFile>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
<DebugSymbols>true</DebugSymbols>
<DebugType>full</DebugType>
<Optimize>false</Optimize>
<OutputPath>bin\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
<NoStdLib>true</NoStdLib>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
<DebugType>pdbonly</DebugType>
<Optimize>true</Optimize>
<OutputPath>bin\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
<NoStdLib>true</NoStdLib>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Silverlight2|AnyCPU' ">
<OutputPath>bin\Silverlight2\</OutputPath>
<DefineConstants>TRACE;SILVERLIGHT2</DefineConstants>
<Optimize>true</Optimize>
<NoStdLib>true</NoStdLib>
<DebugType>pdbonly</DebugType>
<PlatformTarget>AnyCPU</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
</PropertyGroup>
<ItemGroup>
<Reference Include="mscorlib" />
<Reference Include="System" />
</ItemGroup>
<ItemGroup>
<Compile Include="AbstractBuilder.cs" />
<Compile Include="AbstractMessage.cs" />
<Compile Include="ByteString.cs" />
<Compile Include="Collections\Enumerables.cs" />
<Compile Include="Collections\IPopsicleList.cs" />
<Compile Include="Collections\PopsicleList.cs" />
<Compile Include="Delegates.cs" />
<Compile Include="CodedInputStream.cs" />
<Compile Include="CodedOutputStream.cs" />
<Compile Include="Collections\Dictionaries.cs" />
<Compile Include="Collections\Lists.cs" />
<Compile Include="Collections\ReadOnlyDictionary.cs" />
<Compile Include="DescriptorProtos\CSharpOptions.cs" />
<Compile Include="DescriptorProtos\DescriptorProtoFile.cs" />
<Compile Include="DescriptorProtos\IDescriptorProto.cs" />
<Compile Include="DescriptorProtos\PartialClasses.cs" />
<Compile Include="Descriptors\DescriptorBase.cs" />
<Compile Include="Descriptors\DescriptorPool.cs" />
<Compile Include="Descriptors\DescriptorUtil.cs" />
<Compile Include="Descriptors\DescriptorValidationException.cs" />
<Compile Include="Descriptors\EnumDescriptor.cs" />
<Compile Include="Descriptors\EnumValueDescriptor.cs" />
<Compile Include="Descriptors\FieldDescriptor.cs" />
<Compile Include="Descriptors\FieldMappingAttribute.cs" />
<Compile Include="Descriptors\FieldType.cs" />
<Compile Include="Descriptors\FileDescriptor.cs" />
<Compile Include="Descriptors\IDescriptor.cs" />
<Compile Include="Descriptors\IndexedDescriptorBase.cs" />
<Compile Include="Descriptors\MappedType.cs" />
<Compile Include="Descriptors\MessageDescriptor.cs" />
<Compile Include="Descriptors\MethodDescriptor.cs" />
<Compile Include="Descriptors\PackageDescriptor.cs" />
<Compile Include="Descriptors\ServiceDescriptor.cs" />
<Compile Include="DynamicMessage.cs" />
<Compile Include="ExtendableBuilder.cs" />
<Compile Include="ExtendableMessage.cs" />
<Compile Include="ExtensionInfo.cs" />
<Compile Include="ExtensionRegistry.cs" />
<Compile Include="FieldAccess\ReflectionUtil.cs" />
<Compile Include="FieldAccess\SingleEnumAccessor.cs" />
<Compile Include="FieldAccess\SingleMessageAccessor.cs" />
<Compile Include="FieldAccess\SinglePrimitiveAccessor.cs" />
<Compile Include="FieldAccess\RepeatedPrimitiveAccessor.cs" />
<Compile Include="FieldAccess\RepeatedEnumAccessor.cs" />
<Compile Include="FieldAccess\IFieldAccessor.cs" />
<Compile Include="FieldAccess\FieldAccessorTable.cs" />
<Compile Include="FieldAccess\RepeatedMessageAccessor.cs" />
<Compile Include="FieldSet.cs" />
<Compile Include="GeneratedBuilder.cs" />
<Compile Include="GeneratedRepeatExtension.cs" />
<Compile Include="GeneratedSingleExtension.cs" />
<Compile Include="GeneratedMessage.cs" />
<Compile Include="IBuilder.cs" />
<Compile Include="GeneratedExtensionBase.cs" />
<Compile Include="IMessage.cs" />
<Compile Include="InvalidProtocolBufferException.cs" />
<Compile Include="IRpcChannel.cs" />
<Compile Include="IRpcController.cs" />
<Compile Include="IService.cs" />
<Compile Include="MessageStreamIterator.cs" />
<Compile Include="MessageStreamWriter.cs" />
<Compile Include="MessageUtil.cs" />
<Compile Include="NameHelpers.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
<Compile Include="RpcUtil.cs" />
<Compile Include="SilverlightCompatibility.cs" />
<Compile Include="SortedList.cs" />
<Compile Include="TextFormat.cs" />
<Compile Include="TextGenerator.cs" />
<Compile Include="TextTokenizer.cs" />
<Compile Include="ThrowHelper.cs" />
<Compile Include="UninitializedMessageException.cs" />
<Compile Include="UnknownField.cs" />
<Compile Include="UnknownFieldSet.cs" />
<Compile Include="WireFormat.cs" />
</ItemGroup>
<ItemGroup>
<None Include="Properties\Google.ProtocolBuffers.snk" />
</ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" Condition=" '$(Configuration)' != 'Silverlight2' " />
<Import Project="$(MSBuildExtensionsPath)\Microsoft\Silverlight\v2.0\Microsoft.Silverlight.CSharp.targets" Condition=" '$(Configuration)' == 'Silverlight2' " />
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="3.5" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
<ProductVersion>9.0.30729</ProductVersion>
<SchemaVersion>2.0</SchemaVersion>
<ProjectGuid>{6908BDCE-D925-43F3-94AC-A531E6DF2591}</ProjectGuid>
<OutputType>Library</OutputType>
<AppDesignerFolder>Properties</AppDesignerFolder>
<RootNamespace>Google.ProtocolBuffers</RootNamespace>
<AssemblyName>Google.ProtocolBuffers</AssemblyName>
<TargetFrameworkVersion>v2.0</TargetFrameworkVersion>
<FileAlignment>512</FileAlignment>
<SignAssembly>true</SignAssembly>
<AssemblyOriginatorKeyFile>Properties\Google.ProtocolBuffers.snk</AssemblyOriginatorKeyFile>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
<DebugSymbols>true</DebugSymbols>
<DebugType>full</DebugType>
<Optimize>false</Optimize>
<OutputPath>bin\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
<NoStdLib>true</NoStdLib>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
<DebugType>pdbonly</DebugType>
<Optimize>true</Optimize>
<OutputPath>bin\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
<NoStdLib>true</NoStdLib>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Silverlight2|AnyCPU' ">
<OutputPath>bin\Silverlight2\</OutputPath>
<DefineConstants>TRACE;SILVERLIGHT2</DefineConstants>
<Optimize>true</Optimize>
<NoStdLib>true</NoStdLib>
<DebugType>pdbonly</DebugType>
<PlatformTarget>AnyCPU</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
</PropertyGroup>
<ItemGroup>
<Reference Include="mscorlib" />
<Reference Include="System" />
</ItemGroup>
<ItemGroup>
<Compile Include="AbstractBuilder.cs" />
<Compile Include="AbstractMessage.cs" />
<Compile Include="ByteBuffer.cs" />
<Compile Include="ByteString.cs" />
<Compile Include="ByteStringStringInterning.cs" />
<Compile Include="Collections\Enumerables.cs" />
<Compile Include="Collections\IPopsicleList.cs" />
<Compile Include="Collections\PopsicleList.cs" />
<Compile Include="Delegates.cs" />
<Compile Include="CodedInputStream.cs" />
<Compile Include="CodedOutputStream.cs" />
<Compile Include="Collections\Dictionaries.cs" />
<Compile Include="Collections\Lists.cs" />
<Compile Include="Collections\ReadOnlyDictionary.cs" />
<Compile Include="DescriptorProtos\CSharpOptions.cs" />
<Compile Include="DescriptorProtos\DescriptorProtoFile.cs" />
<Compile Include="DescriptorProtos\IDescriptorProto.cs" />
<Compile Include="DescriptorProtos\PartialClasses.cs" />
<Compile Include="Descriptors\DescriptorBase.cs" />
<Compile Include="Descriptors\DescriptorPool.cs" />
<Compile Include="Descriptors\DescriptorUtil.cs" />
<Compile Include="Descriptors\DescriptorValidationException.cs" />
<Compile Include="Descriptors\EnumDescriptor.cs" />
<Compile Include="Descriptors\EnumValueDescriptor.cs" />
<Compile Include="Descriptors\FieldDescriptor.cs" />
<Compile Include="Descriptors\FieldMappingAttribute.cs" />
<Compile Include="Descriptors\FieldType.cs" />
<Compile Include="Descriptors\FileDescriptor.cs" />
<Compile Include="Descriptors\IDescriptor.cs" />
<Compile Include="Descriptors\IndexedDescriptorBase.cs" />
<Compile Include="Descriptors\MappedType.cs" />
<Compile Include="Descriptors\MessageDescriptor.cs" />
<Compile Include="Descriptors\MethodDescriptor.cs" />
<Compile Include="Descriptors\PackageDescriptor.cs" />
<Compile Include="Descriptors\ServiceDescriptor.cs" />
<Compile Include="DynamicMessage.cs" />
<Compile Include="ExtendableBuilder.cs" />
<Compile Include="ExtendableMessage.cs" />
<Compile Include="ExtensionInfo.cs" />
<Compile Include="ExtensionRegistry.cs" />
<Compile Include="FieldAccess\ReflectionUtil.cs" />
<Compile Include="FieldAccess\SingleEnumAccessor.cs" />
<Compile Include="FieldAccess\SingleMessageAccessor.cs" />
<Compile Include="FieldAccess\SinglePrimitiveAccessor.cs" />
<Compile Include="FieldAccess\RepeatedPrimitiveAccessor.cs" />
<Compile Include="FieldAccess\RepeatedEnumAccessor.cs" />
<Compile Include="FieldAccess\IFieldAccessor.cs" />
<Compile Include="FieldAccess\FieldAccessorTable.cs" />
<Compile Include="FieldAccess\RepeatedMessageAccessor.cs" />
<Compile Include="FieldSet.cs" />
<Compile Include="GeneratedBuilder.cs" />
<Compile Include="GeneratedRepeatExtension.cs" />
<Compile Include="GeneratedSingleExtension.cs" />
<Compile Include="GeneratedMessage.cs" />
<Compile Include="IBuilder.cs" />
<Compile Include="GeneratedExtensionBase.cs" />
<Compile Include="IMessage.cs" />
<Compile Include="InvalidProtocolBufferException.cs" />
<Compile Include="IRpcChannel.cs" />
<Compile Include="IRpcController.cs" />
<Compile Include="IService.cs" />
<Compile Include="MessageStreamIterator.cs" />
<Compile Include="MessageStreamWriter.cs" />
<Compile Include="MessageUtil.cs" />
<Compile Include="NameHelpers.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
<Compile Include="RpcUtil.cs" />
<Compile Include="SilverlightCompatibility.cs" />
<Compile Include="SortedList.cs" />
<Compile Include="TextFormat.cs" />
<Compile Include="TextGenerator.cs" />
<Compile Include="TextTokenizer.cs" />
<Compile Include="ThrowHelper.cs" />
<Compile Include="UninitializedMessageException.cs" />
<Compile Include="UnknownField.cs" />
<Compile Include="UnknownFieldSet.cs" />
<Compile Include="WireFormat.cs" />
</ItemGroup>
<ItemGroup>
<None Include="Properties\Google.ProtocolBuffers.snk" />
</ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" Condition=" '$(Configuration)' != 'Silverlight2' " />
<Import Project="$(MSBuildExtensionsPath)\Microsoft\Silverlight\v2.0\Microsoft.Silverlight.CSharp.targets" Condition=" '$(Configuration)' == 'Silverlight2' " />
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
Other similar extension points exist, see Microsoft.Common.targets.
<Target Name="BeforeBuild">
</Target>
<Target Name="AfterBuild">
</Target>
-->
-->
</Project>
Loading…
Cancel
Save