mirror of https://github.com/madler/zlib.git
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
408 lines
10 KiB
408 lines
10 KiB
|
|
; match.asm -- Pentium-Pro optimized version of longest_match() |
|
; |
|
; Updated for zlib 1.1.3 and converted to MASM 6.1x |
|
; Copyright (C) 2000 Dan Higdon <hdan@kinesoft.com> |
|
; and Chuck Walbourn <chuckw@kinesoft.com> |
|
; Corrections by Cosmin Truta <cosmint@cs.ubbcluj.ro> |
|
; |
|
; This is free software; you can redistribute it and/or modify it |
|
; under the terms of the GNU General Public License. |
|
|
|
; Based on match.S |
|
; Written for zlib 1.1.2 |
|
; Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com> |
|
|
|
.686P |
|
.MODEL FLAT |
|
|
|
;=========================================================================== |
|
; EQUATES |
|
;=========================================================================== |
|
|
|
MAX_MATCH EQU 258 |
|
MIN_MATCH EQU 3 |
|
MIN_LOOKAHEAD EQU (MAX_MATCH + MIN_MATCH + 1) |
|
MAX_MATCH_8 EQU ((MAX_MATCH + 7) AND (NOT 7)) |
|
|
|
;=========================================================================== |
|
; STRUCTURES |
|
;=========================================================================== |
|
|
|
; This STRUCT assumes a 4-byte alignment |
|
|
|
DEFLATE_STATE STRUCT |
|
ds_strm dd ? |
|
ds_status dd ? |
|
ds_pending_buf dd ? |
|
ds_pending_buf_size dd ? |
|
ds_pending_out dd ? |
|
ds_pending dd ? |
|
ds_wrap dd ? |
|
ds_data_type db ? |
|
ds_method db ? |
|
db ? ; padding |
|
db ? ; padding |
|
ds_last_flush dd ? |
|
ds_w_size dd ? ; used |
|
ds_w_bits dd ? |
|
ds_w_mask dd ? ; used |
|
ds_window dd ? ; used |
|
ds_window_size dd ? |
|
ds_prev dd ? ; used |
|
ds_head dd ? |
|
ds_ins_h dd ? |
|
ds_hash_size dd ? |
|
ds_hash_bits dd ? |
|
ds_hash_mask dd ? |
|
ds_hash_shift dd ? |
|
ds_block_start dd ? |
|
ds_match_length dd ? ; used |
|
ds_prev_match dd ? ; used |
|
ds_match_available dd ? |
|
ds_strstart dd ? ; used |
|
ds_match_start dd ? ; used |
|
ds_lookahead dd ? ; used |
|
ds_prev_length dd ? ; used |
|
ds_max_chain_length dd ? ; used |
|
ds_max_laxy_match dd ? |
|
ds_level dd ? |
|
ds_strategy dd ? |
|
ds_good_match dd ? ; used |
|
ds_nice_match dd ? ; used |
|
|
|
; Don't need anymore of the struct for match |
|
DEFLATE_STATE ENDS |
|
|
|
;=========================================================================== |
|
; CODE |
|
;=========================================================================== |
|
_TEXT SEGMENT |
|
|
|
;--------------------------------------------------------------------------- |
|
; match_init |
|
;--------------------------------------------------------------------------- |
|
ALIGN 4 |
|
PUBLIC _match_init |
|
_match_init PROC |
|
; no initialization needed |
|
ret |
|
_match_init ENDP |
|
|
|
;--------------------------------------------------------------------------- |
|
; uInt longest_match(deflate_state *deflatestate, IPos curmatch) |
|
;--------------------------------------------------------------------------- |
|
ALIGN 4 |
|
|
|
PUBLIC _longest_match |
|
_longest_match PROC |
|
|
|
; Since this code uses EBP for a scratch register, the stack frame must |
|
; be manually constructed and referenced relative to the ESP register. |
|
|
|
; Stack image |
|
; Variables |
|
chainlenwmask = 0 ; high word: current chain len |
|
; low word: s->wmask |
|
window = 4 ; local copy of s->window |
|
windowbestlen = 8 ; s->window + bestlen |
|
scanend = 12 ; last two bytes of string |
|
scanstart = 16 ; first two bytes of string |
|
scanalign = 20 ; dword-misalignment of string |
|
nicematch = 24 ; a good enough match size |
|
bestlen = 28 ; size of best match so far |
|
scan = 32 ; ptr to string wanting match |
|
varsize = 36 ; number of bytes (also offset to last saved register) |
|
|
|
; Saved Registers (actually pushed into place) |
|
ebx_save = 36 |
|
edi_save = 40 |
|
esi_save = 44 |
|
ebp_save = 48 |
|
|
|
; Parameters |
|
retaddr = 52 |
|
deflatestate = 56 |
|
curmatch = 60 |
|
|
|
; Save registers that the compiler may be using |
|
push ebp |
|
push edi |
|
push esi |
|
push ebx |
|
|
|
; Allocate local variable space |
|
sub esp,varsize |
|
|
|
; Retrieve the function arguments. ecx will hold cur_match |
|
; throughout the entire function. edx will hold the pointer to the |
|
; deflate_state structure during the function's setup (before |
|
; entering the main loop). |
|
|
|
mov edx, [esp+deflatestate] |
|
ASSUME edx:PTR DEFLATE_STATE |
|
|
|
mov ecx, [esp+curmatch] |
|
|
|
; uInt wmask = s->w_mask; |
|
; unsigned chain_length = s->max_chain_length; |
|
; if (s->prev_length >= s->good_match) { |
|
; chain_length >>= 2; |
|
; } |
|
|
|
mov eax, [edx].ds_prev_length |
|
mov ebx, [edx].ds_good_match |
|
cmp eax, ebx |
|
mov eax, [edx].ds_w_mask |
|
mov ebx, [edx].ds_max_chain_length |
|
jl SHORT LastMatchGood |
|
shr ebx, 2 |
|
LastMatchGood: |
|
|
|
; chainlen is decremented once beforehand so that the function can |
|
; use the sign flag instead of the zero flag for the exit test. |
|
; It is then shifted into the high word, to make room for the wmask |
|
; value, which it will always accompany. |
|
|
|
dec ebx |
|
shl ebx, 16 |
|
or ebx, eax |
|
mov [esp+chainlenwmask], ebx |
|
|
|
; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; |
|
|
|
mov eax, [edx].ds_nice_match |
|
mov ebx, [edx].ds_lookahead |
|
cmp ebx, eax |
|
jl SHORT LookaheadLess |
|
mov ebx, eax |
|
LookaheadLess: |
|
mov [esp+nicematch], ebx |
|
|
|
;/* register Bytef *scan = s->window + s->strstart; */ |
|
|
|
mov esi, [edx].ds_window |
|
mov [esp+window], esi |
|
mov ebp, [edx].ds_strstart |
|
lea edi, [esi+ebp] |
|
mov [esp+scan],edi |
|
|
|
;/* Determine how many bytes the scan ptr is off from being */ |
|
;/* dword-aligned. */ |
|
|
|
mov eax, edi |
|
neg eax |
|
and eax, 3 |
|
mov [esp+scanalign], eax |
|
|
|
;/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */ |
|
;/* s->strstart - (IPos)MAX_DIST(s) : NIL; */ |
|
|
|
mov eax, [edx].ds_w_size |
|
sub eax, MIN_LOOKAHEAD |
|
sub ebp, eax |
|
jg SHORT LimitPositive |
|
xor ebp, ebp |
|
LimitPositive: |
|
|
|
;/* int best_len = s->prev_length; */ |
|
|
|
mov eax, [edx].ds_prev_length |
|
mov [esp+bestlen], eax |
|
|
|
;/* Store the sum of s->window + best_len in %esi locally, and in %esi. */ |
|
|
|
add esi, eax |
|
mov [esp+windowbestlen], esi |
|
|
|
;/* register ush scan_start = *(ushf*)scan; */ |
|
;/* register ush scan_end = *(ushf*)(scan+best_len-1); */ |
|
;/* Posf *prev = s->prev; */ |
|
|
|
movzx ebx, WORD PTR[edi] |
|
mov [esp+scanstart], ebx |
|
movzx ebx, WORD PTR[eax+edi-1] |
|
mov [esp+scanend], ebx |
|
mov edi, [edx].ds_prev |
|
|
|
;/* Jump into the main loop. */ |
|
|
|
mov edx, [esp+chainlenwmask] |
|
jmp SHORT LoopEntry |
|
|
|
;/* do { |
|
; * match = s->window + cur_match; |
|
; * if (*(ushf*)(match+best_len-1) != scan_end || |
|
; * *(ushf*)match != scan_start) continue; |
|
; * [...] |
|
; * } while ((cur_match = prev[cur_match & wmask]) > limit |
|
; * && --chain_length != 0); |
|
; * |
|
; * Here is the inner loop of the function. The function will spend the |
|
; * majority of its time in this loop, and majority of that time will |
|
; * be spent in the first ten instructions. |
|
; * |
|
; * Within this loop: |
|
; * %ebx = scanend |
|
; * %ecx = curmatch |
|
; * %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask) |
|
; * %esi = windowbestlen - i.e., (window + bestlen) |
|
; * %edi = prev |
|
; * %ebp = limit |
|
; */ |
|
|
|
ALIGN 4 |
|
LookupLoop: |
|
and ecx, edx |
|
movzx ecx, WORD PTR[edi+ecx*2] |
|
cmp ecx, ebp |
|
jbe LeaveNow |
|
sub edx, 000010000H |
|
js LeaveNow |
|
|
|
LoopEntry: |
|
movzx eax, WORD PTR[esi+ecx-1] |
|
cmp eax, ebx |
|
jnz SHORT LookupLoop |
|
|
|
mov eax, [esp+window] |
|
movzx eax, WORD PTR[eax+ecx] |
|
cmp eax, [esp+scanstart] |
|
jnz SHORT LookupLoop |
|
|
|
;/* Store the current value of chainlen. */ |
|
|
|
mov [esp+chainlenwmask], edx |
|
|
|
;/* Point %edi to the string under scrutiny, and %esi to the string we */ |
|
;/* are hoping to match it up with. In actuality, %esi and %edi are */ |
|
;/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */ |
|
;/* initialized to -(MAX_MATCH_8 - scanalign). */ |
|
|
|
mov esi, [esp+window] |
|
mov edi, [esp+scan] |
|
add esi, ecx |
|
mov eax, [esp+scanalign] |
|
mov edx, -MAX_MATCH_8 |
|
lea edi, [edi+eax+MAX_MATCH_8] |
|
lea esi, [esi+eax+MAX_MATCH_8] |
|
|
|
;/* Test the strings for equality, 8 bytes at a time. At the end, |
|
; * adjust %edx so that it is offset to the exact byte that mismatched. |
|
; * |
|
; * We already know at this point that the first three bytes of the |
|
; * strings match each other, and they can be safely passed over before |
|
; * starting the compare loop. So what this code does is skip over 0-3 |
|
; * bytes, as much as necessary in order to dword-align the %edi |
|
; * pointer. (%esi will still be misaligned three times out of four.) |
|
; * |
|
; * It should be confessed that this loop usually does not represent |
|
; * much of the total running time. Replacing it with a more |
|
; * straightforward "rep cmpsb" would not drastically degrade |
|
; * performance. |
|
; */ |
|
|
|
LoopCmps: |
|
mov eax, DWORD PTR[esi+edx] |
|
xor eax, DWORD PTR[edi+edx] |
|
jnz SHORT LeaveLoopCmps |
|
|
|
mov eax, DWORD PTR[esi+edx+4] |
|
xor eax, DWORD PTR[edi+edx+4] |
|
jnz SHORT LeaveLoopCmps4 |
|
|
|
add edx, 8 |
|
jnz SHORT LoopCmps |
|
jmp LenMaximum |
|
ALIGN 4 |
|
|
|
LeaveLoopCmps4: |
|
add edx, 4 |
|
|
|
LeaveLoopCmps: |
|
test eax, 00000FFFFH |
|
jnz SHORT LenLower |
|
|
|
add edx, 2 |
|
shr eax, 16 |
|
|
|
LenLower: |
|
sub al, 1 |
|
adc edx, 0 |
|
|
|
;/* Calculate the length of the match. If it is longer than MAX_MATCH, */ |
|
;/* then automatically accept it as the best possible match and leave. */ |
|
|
|
lea eax, [edi+edx] |
|
mov edi, [esp+scan] |
|
sub eax, edi |
|
cmp eax, MAX_MATCH |
|
jge SHORT LenMaximum |
|
|
|
;/* If the length of the match is not longer than the best match we */ |
|
;/* have so far, then forget it and return to the lookup loop. */ |
|
|
|
mov edx, [esp+deflatestate] |
|
mov ebx, [esp+bestlen] |
|
cmp eax, ebx |
|
jg SHORT LongerMatch |
|
mov esi, [esp+windowbestlen] |
|
mov edi, [edx].ds_prev |
|
mov ebx, [esp+scanend] |
|
mov edx, [esp+chainlenwmask] |
|
jmp LookupLoop |
|
ALIGN 4 |
|
|
|
;/* s->match_start = cur_match; */ |
|
;/* best_len = len; */ |
|
;/* if (len >= nice_match) break; */ |
|
;/* scan_end = *(ushf*)(scan+best_len-1); */ |
|
|
|
LongerMatch: |
|
mov ebx, [esp+nicematch] |
|
mov [esp+bestlen], eax |
|
mov [edx].ds_match_start, ecx |
|
cmp eax, ebx |
|
jge SHORT LeaveNow |
|
mov esi, [esp+window] |
|
add esi, eax |
|
mov [esp+windowbestlen], esi |
|
movzx ebx, WORD PTR[edi+eax-1] |
|
mov edi, [edx].ds_prev |
|
mov [esp+scanend], ebx |
|
mov edx, [esp+chainlenwmask] |
|
jmp LookupLoop |
|
ALIGN 4 |
|
|
|
;/* Accept the current string, with the maximum possible length. */ |
|
|
|
LenMaximum: |
|
mov edx, [esp+deflatestate] |
|
mov DWORD PTR[esp+bestlen], MAX_MATCH |
|
mov [edx].ds_match_start, ecx |
|
|
|
;/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */ |
|
;/* return s->lookahead; */ |
|
|
|
LeaveNow: |
|
mov edx, [esp+deflatestate] |
|
mov ebx, [esp+bestlen] |
|
mov eax, [edx].ds_lookahead |
|
cmp ebx, eax |
|
jg SHORT LookaheadRet |
|
mov eax, ebx |
|
LookaheadRet: |
|
|
|
; Restore the stack and return from whence we came. |
|
|
|
add esp, varsize |
|
pop ebx |
|
pop esi |
|
pop edi |
|
pop ebp |
|
ret |
|
|
|
_longest_match ENDP |
|
|
|
_TEXT ENDS |
|
END
|
|
|