use Fcntl ':seek'; use Test::More qw(no_plan); # # Arguments: # $fh: A filehandle opened to a sorted, seekable source of data such as a file # $key: A target key # Returns: # The first record $R from the file such that $R ge $key # Or undef if there is no such record # Side effects: # $fh is left positioned at the beginning of $R sub search { my ($fh, $key) = @_; my ($bs, $size) = (stat($fh))[11,7]; $bs ||= 8192; # Invariant condition: Suppose the correct final position for $fh is $N. # Then $lo*$bs <= $N < $hi*$bs at all times. my ($lo, $hi) = (0, int($size/$bs)); while (1) { my $mid = int(($lo + $hi)/2); if ($mid) { # $mid probably points into the middle of a record, # skip the rest of that record and start at the next complete record seek $fh, $mid * $bs, SEEK_SET; my $junk = <$fh>; } else { # Special case for $mid at the very beginning of the file seek $fh, 0, SEEK_SET; } # The curent record starts at $start my $start = tell $fh; my $rec = <$fh>; chomp $rec; if ($hi <= $lo+1) { # We've come as close as possible # Do linear search in the current block while ($rec lt $key) { $rec = <$fh>; return unless defined $rec; chomp $rec; $start = tell $fh; } seek $fh, $start, SEEK_SET; return $rec; } if ($rec lt $key) { $lo = $mid } else { $hi = $mid } } } #my $dict = "/usr/dict/words"; my $dict = "./words"; open FH, "<", $dict or die $!; if ($TEST) { # Exact match is(search(\*FH, 'snob'), 'snob'); # Mismatch is(search(\*FH, 'snonk'), 'snook'); # Mismatches is(search(\*FH, 'snoba'), 'snobbery'); is(search(\*FH, 'snobb'), 'snobbery'); is(search(\*FH, 'snobbe'), 'snobbery'); is(search(\*FH, 'snobber'), 'snobbery'); # Exact match is(search(\*FH, 'snobbery'), 'snobbery'); # Mismatch is(search(\*FH, 'snobberys'), 'snobbish'); # First word in the file is(search(\*FH, ''), '10th'); is(search(\*FH, '1'), '10th'); # Second word in the file is(search(\*FH, '11'), '1st'); # Last word in the file is(search(\*FH, 'zy'), 'zygote'); is(search(\*FH, 'zygote'), 'zygote'); is(search(\*FH, 'zv'), 'zygote'); # Next-to-last word in the file is(search(\*FH, 'zu'), 'zucchini'); # Failure calls is(search(\*FH, 'zygotes'), undef); is(search(\*FH, 'zz'), undef); # For 'Siegmund', the newline is the first character in the block # 'Siemens'is the next word is(search(\*FH, 'Siegmund'), 'Siegmund'); is(search(\*FH, 'Siem'), 'Siemens'); is(search(\*FH, 'Siemens'), 'Siemens'); # For 'Clausen', the newline is the last character in the block # 'Clausius'is the next word is(search(\*FH, 'Clausen'), 'Clausen'); is(search(\*FH, 'Clausi'), 'Clausius'); is(search(\*FH, 'Clausius'), 'Clausius'); if ($TEST > 1) { # Now try everything open D, "<", $dict or die $!; while () { chomp; is(search(\*FH, $_), $_, $_); } } } exit if $TEST; seek FH, 0, 0; chomp(my @words = ); my ($su, $ss) = times; for my $key (@words) { while () { chomp; last if $_ ge $key; } seek FH, 0, SEEK_SET; print STDERR "# $key\n"; } my ($eu, $es) = times; my ($tu, $ts) = ($eu-$su, $es-$ss); my $time = $tu+$ts; printf "%12s: %5.2f %5.2f %6.2f\n", "Linear", $tu, $ts, $time; my ($su, $ss) = times; for my $key (@words) { search(\*FH, $key); } my ($eu, $es) = times; my ($tu, $ts) = ($eu-$su, $es-$ss); my $time = $tu+$ts; printf "%12s: %5.2f %5.2f %6.2f\n", "Binary", $tu, $ts, $time; __DATA__ ambulatory appetite armoire fan lien matriarch scarves suspense tinkle tuff