wiki:HashSourceCodes

Version 1 (modified by os, 16 years ago) (diff)

--

The hash code is based on Media Player Classic. In natural language it calculates: file size + 64-bit checksum of the first and last 64k (even if they overlap because the file is smaller than 128k).

Feel free to edit or add source code if you have a faster or better implementation. Don't forget to check that the hash is correct: please test these 2 files to ensure your algorithm is completely OK:

  • AVI file (12 909 756 bytes)
    • hash: 8e245d9679d31e12
  • DUMMY RAR file (2 565 922 bytes, 4 295 033 890 after RAR unpacking)
    • hash: 61f7751fc2a72bfb

C

#include <stdio.h>
#include <stdlib.h>

#define MAX(x,y) (((x) > (y)) ? (x) : (y))
#ifndef uint64_t
#define uint64_t unsigned long long
#endif

/*
 * Compute the OpenSubtitles / Media Player Classic hash of an open file:
 * file size + sum of the 64-bit words in the first 64 KiB + sum of the
 * 64-bit words in the last 64 KiB, all modulo 2^64.
 *
 * handle: stream opened in binary mode ("rb"); it must be seekable.
 * Returns the hash, or 0 when the file size cannot be determined.
 *
 * Words are read in host byte order, so the result matches the reference
 * values only on little-endian machines. Trailing bytes that do not fill a
 * whole 8-byte word are ignored. The size comes from ftell(), so files
 * larger than LONG_MAX bytes are not supported by this portable version.
 */
uint64_t compute_hash(FILE * handle)
{
	uint64_t hash, tmp = 0;
	long fsize, tail;
	size_t i;

	fseek(handle, 0, SEEK_END);
	fsize = ftell(handle);
	if (fsize < 0)
		return 0;
	fseek(handle, 0, SEEK_SET);

	hash = (uint64_t)fsize;

	/* Head: up to 64 KiB starting at offset 0. */
	for (i = 0; i < 65536 / sizeof(tmp) && fread(&tmp, sizeof(tmp), 1, handle) == 1; i++)
		hash += tmp;

	/*
	 * Tail: the last 64 KiB. The offset is computed with signed arithmetic
	 * so that files shorter than 64 KiB clamp to 0 instead of wrapping
	 * around (which made fseek fail and skipped the tail chunk entirely).
	 */
	tail = (fsize > 65536L) ? fsize - 65536L : 0L;
	fseek(handle, tail, SEEK_SET);
	for (i = 0; i < 65536 / sizeof(tmp) && fread(&tmp, sizeof(tmp), 1, handle) == 1; i++)
		hash += tmp;

	return hash;
}

/*
 * Print the hash of a video file as 16 lowercase hexadecimal digits.
 *
 * The file to hash is taken from the first command-line argument; when no
 * argument is given the sample file "breakdance.avi" is used.
 * Returns 0 on success and 1 when the file cannot be opened.
 */
int main(int argc, char *argv[])
{
	FILE * handle;
	uint64_t myhash;
	const char * path = (argc > 1) ? argv[1] : "breakdance.avi";

	handle = fopen(path, "rb");

	if (!handle)
	{
		printf("Error openning file!");
		return 1;
	}

	myhash = compute_hash(handle);
	/* Portable 64-bit format, zero-padded so the hash is always 16 digits. */
	printf("%016llx", (unsigned long long)myhash);

	fclose(handle);
	return 0;
}

C++

 #include <fstream>
 #include <iomanip>
 #include <iostream>
 
 typedef unsigned __int64 uint64_t;
 using namespace std;
 
 // Return the larger of two ints (y when they are equal).
 int MAX(int x, int y)
 {
 	return (y < x) ? x : y;
 }
 
 // Compute the OpenSubtitles / Media Player Classic hash of an open file:
 // file size + sum of the 64-bit words in the first 64 KiB + sum of the
 // 64-bit words in the last 64 KiB, all modulo 2^64.
 //
 // f must be opened with ios::binary. Words are read in host byte order
 // (little-endian hosts give the reference values); trailing bytes that do
 // not fill a whole 8-byte word are ignored.
 uint64_t compute_hash(std::ifstream& f)
 {
 	const std::streamoff chunk = 65536;
 	uint64_t hash, tmp = 0;
 	std::size_t i;

 	f.clear();
 	f.seekg(0, std::ios::end);
 	const std::streamoff fsize = f.tellg();
 	f.seekg(0, std::ios::beg);

 	hash = (uint64_t)fsize;
 	for (i = 0; i < 65536 / sizeof(tmp) && f.read((char*)&tmp, sizeof(tmp)); i++)
 		hash += tmp;

 	// A short file leaves eofbit/failbit set after the head loop; without
 	// clear() the following seekg and reads would silently do nothing.
 	f.clear();
 	// The offset stays in streamoff (not int) so files over 2 GiB seek to
 	// the right place, and it is clamped to 0 for files under 64 KiB.
 	f.seekg(fsize > chunk ? fsize - chunk : 0, std::ios::beg);
 	for (i = 0; i < 65536 / sizeof(tmp) && f.read((char*)&tmp, sizeof(tmp)); i++)
 		hash += tmp;

 	return hash;
 }
 
 // Print the hash of a video file as 16 lowercase hexadecimal digits.
 //
 // The file to hash is taken from the first command-line argument; when no
 // argument is given the sample path c:\test.avi is used.
 // Returns 0 on success and 1 when the file cannot be opened.
 // Requires <iomanip> for setw/setfill.
 int main(int argc, char *argv[])
 {
 	ifstream f;
 	uint64_t myhash;
 	const char *path = (argc > 1) ? argv[1] : "c:\\test.avi";

 	f.open(path, ios::in|ios::binary|ios::ate);
 	if (!f.is_open()) {
 	   cerr << "Error opening file" << endl;
 	   return 1;
 	}

 	myhash = compute_hash(f);
 	cout << setw(16) << setfill('0') << hex << myhash;

 	f.close();
 	return 0;
 }

Java

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.ByteOrder;
import java.nio.LongBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.channels.FileChannel.MapMode;


/**
 * Hash code is based on Media Player Classic. In natural language it calculates: size + 64bit
 * checksum of the first and last 64k (even if they overlap because the file is smaller than
 * 128k).
 */
public class OpenSubtitlesHasher {
	
	/**
	 * Size of the chunks that will be hashed in bytes (64 KB)
	 */
	private static final int HASH_CHUNK_SIZE = 64 * 1024;
	
	
	public static String computeHash(File file) throws IOException {
		long size = file.length();
		long chunkSizeForFile = Math.min(HASH_CHUNK_SIZE, size);
		
		FileChannel fileChannel = new FileInputStream(file).getChannel();
		
		long head = computeHashForChunk(fileChannel, 0, chunkSizeForFile);
		long tail = computeHashForChunk(fileChannel, Math.max(size - HASH_CHUNK_SIZE, 0), chunkSizeForFile);
		
		fileChannel.close();
		
		return String.format("%016x", size + head + tail);
	}
	

	private static long computeHashForChunk(FileChannel fileChannel, long start, long size) throws IOException {
		MappedByteBuffer byteBuffer = fileChannel.map(MapMode.READ_ONLY, start, size);
		
		LongBuffer longBuffer = byteBuffer.order(ByteOrder.LITTLE_ENDIAN).asLongBuffer();
		long hash = 0;
		
		while (longBuffer.hasRemaining()) {
			hash += longBuffer.get();
		}
		
		return hash;
	}
	
}

C#

using System;
using System.Text;
using System.IO;
   
namespace MovieHasher
{
    /// <summary>
    /// Computes the OpenSubtitles / Media Player Classic movie hash:
    /// file size + sum of the 64-bit little-endian words in the first 64 KiB
    /// + sum of the 64-bit little-endian words in the last 64 KiB (modulo 2^64).
    /// </summary>
    class Program
    {
        /// <summary>Number of bytes hashed at each end of the file (64 KiB).</summary>
        private const int HashChunkSize = 65536;

        /// <summary>Opens <paramref name="filename"/> and computes its movie hash.</summary>
        /// <param name="filename">Path of the file to hash.</param>
        /// <returns>The 8 hash bytes, most significant byte first.</returns>
        private static byte[] ComputeMovieHash(string filename)
        {
            using (Stream input = File.OpenRead(filename))
            {
                return ComputeMovieHash(input);
            }
        }

        /// <summary>Computes the movie hash of a seekable stream.</summary>
        /// <param name="input">
        /// Stream to hash. The head chunk is read from the stream's current position, so the
        /// stream is expected to be positioned at its start. The stream is left open; the
        /// caller owns it.
        /// </param>
        /// <returns>The 8 hash bytes, most significant byte first.</returns>
        private static byte[] ComputeMovieHash(Stream input)
        {
            long streamsize = input.Length;
            byte[] chunk = new byte[HashChunkSize];

            // The hash is defined modulo 2^64, so the additions must wrap even when the
            // project is compiled with overflow checking enabled.
            unchecked
            {
                long lhash = streamsize;
                lhash += SumChunk(input, chunk);

                input.Position = Math.Max(0, streamsize - HashChunkSize);
                lhash += SumChunk(input, chunk);

                // Emit the value big-endian independently of the host byte order.
                byte[] result = new byte[sizeof(long)];
                for (int i = result.Length - 1; i >= 0; i--)
                {
                    result[i] = (byte)lhash;
                    lhash >>= 8;
                }
                return result;
            }
        }

        /// <summary>
        /// Fills <paramref name="chunk"/> from the stream's current position (or up to end of
        /// stream) and sums the complete little-endian 64-bit words that were read.
        /// </summary>
        /// <param name="input">Stream to read from.</param>
        /// <param name="chunk">Scratch buffer whose length is the chunk size.</param>
        /// <returns>The wrapping sum of the complete words read.</returns>
        private static long SumChunk(Stream input, byte[] chunk)
        {
            // Stream.Read may return fewer bytes than requested, so keep reading until the
            // buffer is full or the stream ends.
            int filled = 0;
            while (filled < chunk.Length)
            {
                int read = input.Read(chunk, filled, chunk.Length - filled);
                if (read <= 0)
                {
                    break;
                }
                filled += read;
            }

            long sum = 0;
            unchecked
            {
                // Trailing bytes that do not fill a whole word are ignored, instead of being
                // mixed with stale bytes from the previous word.
                for (int offset = 0; offset + sizeof(long) <= filled; offset += sizeof(long))
                {
                    long word = 0;
                    for (int b = sizeof(long) - 1; b >= 0; b--)
                    {
                        word = (word << 8) | chunk[offset + b];
                    }
                    sum += word;
                }
            }
            return sum;
        }

        /// <summary>Formats bytes as lowercase hexadecimal, two digits per byte.</summary>
        /// <param name="bytes">Bytes to format.</param>
        /// <returns>The hexadecimal string.</returns>
        private static string ToHexadecimal(byte[] bytes)
        {
            StringBuilder hexBuilder = new StringBuilder();
            for (int i = 0; i < bytes.Length; i++)
            {
                hexBuilder.Append(bytes[i].ToString("x2"));
            }
            return hexBuilder.ToString();
        }

        /// <summary>Prints the hash of the sample file C:\test.avi.</summary>
        static void Main(string[] args)
        {
            byte[] moviehash = ComputeMovieHash(@"C:\test.avi");
            Console.WriteLine("The hash of the movie-file is: {0}", ToHexadecimal(moviehash));
        }
    }
}

VB.Net

Imports System
Imports System.Text
Imports System.IO
'Note: you must remove integer overflow checking.

Namespace MovieHasher
	''' <summary>
	''' Computes the OpenSubtitles / Media Player Classic movie hash:
	''' file size + sum of the 64-bit words in the first 64 KiB + sum of the
	''' 64-bit words in the last 64 KiB. The sums must wrap, so the project has
	''' to be compiled with integer overflow checks removed.
	''' </summary>
	Class Program
		''' <summary>Number of bytes hashed at each end of the file (64 KiB).</summary>
		Private Const HashChunkSize As Integer = 65536
		''' <summary>Size in bytes of one summed word (a 64-bit integer).</summary>
		Private Const WordSize As Integer = 8
		
		''' <summary>Opens the file and computes its movie hash.</summary>
		''' <param name="filename">Path of the file to hash.</param>
		''' <returns>The 8 hash bytes, most significant byte first.</returns>
		Private Shared Function ComputeMovieHash(ByVal filename As String) As Byte()
			Using input As Stream = File.OpenRead(filename)
				Return ComputeMovieHash(input)
			End Using
		End Function
		
		''' <summary>Computes the movie hash of a seekable stream.</summary>
		''' <param name="input">
		''' Stream to hash. The head chunk is read from the stream's current position,
		''' so the stream is expected to be positioned at its start. The stream is left
		''' open; the caller owns it.
		''' </param>
		''' <returns>The 8 hash bytes, most significant byte first.</returns>
		Private Shared Function ComputeMovieHash(ByVal input As Stream) As Byte()
			Dim streamsize As Long = input.Length
			Dim lhash As Long = streamsize
			Dim chunk As Byte() = New Byte(HashChunkSize - 1) {}
			
			lhash += SumChunk(input, chunk)
			
			input.Position = Math.Max(0L, streamsize - HashChunkSize)
			lhash += SumChunk(input, chunk)
			
			' BitConverter uses host byte order; the reversal yields most-significant-first
			' output on little-endian hosts.
			Dim result As Byte() = BitConverter.GetBytes(lhash)
			Array.Reverse(result)
			Return result
		End Function
		
		''' <summary>
		''' Fills the buffer from the stream's current position (or up to end of stream)
		''' and sums the complete 64-bit words that were read.
		''' </summary>
		''' <param name="input">Stream to read from.</param>
		''' <param name="chunk">Scratch buffer whose length is the chunk size.</param>
		''' <returns>The sum of the complete words read.</returns>
		Private Shared Function SumChunk(ByVal input As Stream, ByVal chunk As Byte()) As Long
			' Stream.Read may return fewer bytes than requested, so keep reading until
			' the buffer is full or the stream ends.
			Dim filled As Integer = 0
			While filled < chunk.Length
				Dim count As Integer = input.Read(chunk, filled, chunk.Length - filled)
				If count <= 0 Then
					Exit While
				End If
				filled += count
			End While
			
			' Trailing bytes that do not fill a whole word are ignored, instead of being
			' mixed with stale bytes from the previous word.
			Dim sum As Long = 0
			Dim offset As Integer = 0
			While offset + WordSize <= filled
				sum += BitConverter.ToInt64(chunk, offset)
				offset += WordSize
			End While
			Return sum
		End Function
		
		''' <summary>Formats bytes as lowercase hexadecimal, two digits per byte.</summary>
		''' <param name="bytes">Bytes to format.</param>
		''' <returns>The hexadecimal string.</returns>
		Private Shared Function ToHexadecimal(ByVal bytes As Byte()) As String
			Dim hexBuilder As New StringBuilder()
			For i As Integer = 0 To bytes.Length - 1
				hexBuilder.Append(bytes(i).ToString("x2"))
			Next
			Return hexBuilder.ToString()
		End Function
		
		''' <summary>Prints the hash of the sample file C:\test.avi.</summary>
		Private Shared Sub Main(ByVal args As String())
			Dim moviehash As Byte() = ComputeMovieHash("C:\test.avi")
			Console.WriteLine("The hash of the movie-file is: {0}", ToHexadecimal(moviehash))
		End Sub
	End Class
End Namespace

Python

def hashFile(name):
    """Return the OpenSubtitles hash of the file at ``name``.

    The hash is the file size plus the sum of the little-endian 64-bit words
    of the first 64 KiB and of the last 64 KiB, truncated to 64 bits and
    formatted as 16 lowercase hexadecimal digits.

    Returns the string "SizeError" when the file is smaller than 128 KiB and
    the string "IOError" when the file cannot be opened or read.
    """
    # Imported here because the snippet ships without module-level imports.
    import os
    import struct

    longlongformat = '<q'  # little-endian long long, independent of the host
    bytesize = struct.calcsize(longlongformat)
    words_per_chunk = 65536 // bytesize

    try:
        # The context manager closes the file on every exit path, including
        # the early "SizeError" return.
        with open(name, "rb") as f:
            filesize = os.path.getsize(name)
            digest = filesize

            if filesize < 65536 * 2:
                return "SizeError"

            for _ in range(words_per_chunk):
                (l_value,) = struct.unpack(longlongformat, f.read(bytesize))
                digest += l_value
                digest = digest & 0xFFFFFFFFFFFFFFFF  # to remain as 64bit number

            f.seek(max(0, filesize - 65536), 0)
            for _ in range(words_per_chunk):
                (l_value,) = struct.unpack(longlongformat, f.read(bytesize))
                digest += l_value
                digest = digest & 0xFFFFFFFFFFFFFFFF

        return "%016x" % digest

    except (IOError):
        return "IOError"

Delphi

This is just a quick conversion of Gabest's original C code. If you can come up with cleaner code, please feel free to post it here.

{ The hash is defined modulo 2^64, so the additions below must be allowed to
  wrap: overflow checking is switched off for this function only and restored
  afterwards if it was on. }
{$IFOPT Q+}{$DEFINE GABESTHASH_RESTORE_Q}{$Q-}{$ENDIF}
{ Returns the Gabest / OpenSubtitles hash of a file: file size + sum of the
  64-bit words in the first 64 KiB + sum of the 64-bit words in the last
  64 KiB, formatted with '%.16x'. Returns '' when the file does not exist.
  Max comes from the Math unit. }
function CalcGabestHash(const Filename: string): string;
var
  i: Integer;
  f: File;
  { Byte rather than char: char is two bytes wide on Unicode compilers, which
    would make this buffer 16 bytes instead of the 8 overlaid by tmp. }
  s: Array[1..8] of Byte;
  tmp: Int64 absolute s;
  hash: Int64;
  readed: Integer;
  OldFM: Byte;
begin
  Result := '';
  if FileExists(Filename) then begin
    { Open file }
    OldFM := FileMode;
    try
      FileMode := fmShareDenyNone;
      AssignFile(f,Filename);
      Reset(f,1);
      try
        { Start hashing }
        hash := filesize(f);
        { Read from the beginning; only complete 8-byte words are added, so a
          short final read cannot mix in stale bytes from the previous word }
        for i := 0 to 8191 do begin
          blockread(f,s,sizeof(s),readed);
          if readed = sizeof(s) then hash := hash + tmp else break;
        end;
        { Read from the end }
        seek(f,Max(0,filesize(f)-65536));
        for i := 0 to 8191 do begin
          blockread(f,s,sizeof(s),readed);
          if readed = sizeof(s) then hash := hash + tmp else break;
        end;
        { Finished }
        Result := Format('%.16x',[hash]);
      finally
        CloseFile(f);
      end;
    finally
      FileMode := OldFM;
    end;
  end;
end;
{$IFDEF GABESTHASH_RESTORE_Q}{$Q+}{$UNDEF GABESTHASH_RESTORE_Q}{$ENDIF}

RealBasic

Combined routine that calculates the fast hash for video files of at least 64 KB and a normal MD5 for subtitles.

    // Fragment of a combined hashing routine. It relies on names that are not
    // declared in this fragment: f (the file), routine ("video" or "subtitle"),
    // bytesizestr and myhash (which receives the result).
    dim b as BinaryStream
    dim mb as MemoryBlock
    
    dim hash,bytesize as UINT64
    dim i, x, chunksize, filelen, difference as integer
    
    hash = 0 //Reset Hash
    difference = 0
    
    if f <> nil and f.Exists then
      b= f.OpenAsBinaryFile
      // The hash starts out as the file length
      hash = b.Length
      bytesize = b.Length
      bytesizestr = str(bytesize)
      
      // Video branch: only taken when the file holds at least one full 64 KB chunk
      if bytesize >= 65536 and routine = "video" then
        chunksize = 65536
        mb = b.Read(65536)
        mb.LittleEndian = True
        
        // Add every 64-bit little-endian word of the first chunk
        for i= 0 to chunksize -1 step 8
          hash = hash+ mb.UINT64Value(i)
        next
        
        // Move to the start of the last chunk (clamped to 0) and add its words
        b.Position = max(b.Length-chunksize, 0)
        mb= b.Read(chunksize)
        mb.LittleEndian = True
        
        for i= 0 to chunksize -1 step 8
          hash = hash+ mb.UINT64Value(i)
        next
        
        // Result is the hash as a lowercase hexadecimal string
        myhash = Lowercase(str(hex(hash)))
        
      // Subtitle branch: MD5 of the whole file, written as lowercase hex
      elseif routine = "subtitle" then
        
        dim c,result as string
        mb = md5(b.Read(b.Length))
        mb.LittleEndian = True
        
        // Convert each digest byte to two hex digits, left-padded with "0"
        for i = 0 to mb.size-1
          x = mb.byte( i )
          c = right( "00"+hex( x ), 2 )
          result = result + c
        next
        result = lowercase( result )
        myhash = result
        
      end

PHP 4/5

/**
 * Compute the OpenSubtitles hash of a file: file size + sum of the 64-bit
 * little-endian words in the first 64 KiB + sum of the 64-bit little-endian
 * words in the last 64 KiB, modulo 2^64.
 *
 * The 64-bit value is kept as four 16-bit words (index 0 = least significant)
 * so the arithmetic also works where PHP integers are only 32 bits wide.
 *
 * @param string $file Path of the file to hash.
 * @return string|false 16 lowercase hexadecimal digits, or false when the
 *                      file cannot be opened.
 */
function OpenSubtitlesHash($file)
{
    $handle = fopen($file, "rb");
    if ($handle === false)
    {
        return false;
    }
    $fsize = filesize($file);
    
    // Bits 32..63 of the size; dropping them gave a wrong hash for files
    // larger than 4 GiB. Division is used instead of ">> 32" because shifting
    // by the full integer width is not reliable on 32-bit builds.
    $high = (int) floor($fsize / 4294967296);
    
    $hash = array(3 => ($high >> 16) & 0xFFFF, 
                  2 => $high & 0xFFFF, 
                  1 => ($fsize >> 16) & 0xFFFF, 
                  0 => $fsize & 0xFFFF);
        
    // Head: 8192 words of 8 bytes = first 64 KiB
    for ($i = 0; $i < 8192; $i++)
    {
        $tmp = ReadUINT64($handle);
        $hash = AddUINT64($hash, $tmp);
    }
    
    // Tail: last 64 KiB, starting at offset 0 for shorter files
    $offset = $fsize - 65536;
    fseek($handle, $offset > 0 ? $offset : 0, SEEK_SET);
    
    for ($i = 0; $i < 8192; $i++)
    {
        $tmp = ReadUINT64($handle);
        $hash = AddUINT64($hash, $tmp); 	
    }
    
    fclose($handle);
    return UINT64FormatHex($hash);
}

/**
 * Read one 64-bit little-endian word from a stream as four 16-bit words.
 *
 * @param resource $handle Open stream positioned at the word to read.
 * @return array Words indexed 0 (least significant) to 3; all zeros when
 *               fewer than 8 bytes are available, so reads past the end of a
 *               short file add nothing to the hash and raise no warnings.
 */
function ReadUINT64($handle)
{
    $data = fread($handle, 8);
    if ($data === false || strlen($data) < 8)
    {
        return array(0 => 0, 1 => 0, 2 => 0, 3 => 0);
    }
    
    $u = unpack("va/vb/vc/vd", $data);
    return array(0 => $u["a"], 1 => $u["b"], 2 => $u["c"], 3 => $u["d"]);
}

/**
 * Add two 64-bit numbers held as four 16-bit words (index 0 = least
 * significant), propagating the carry from word to word. A carry out of the
 * top word is dropped.
 *
 * @param array $a First operand.
 * @param array $b Second operand.
 * @return array The sum, indexed 0 to 3.
 */
function AddUINT64($a, $b)
{
    $sum = array();
    $carry = 0;
    
    foreach (array(0, 1, 2, 3) as $idx)
    {
        $total = $a[$idx] + $b[$idx] + $carry;
        $overflowed = $total > 0xffff;
        
        $sum[$idx] = 0 + ($overflowed ? ($total & 0xffff) : $total);
        $carry = $overflowed ? 1 : 0;
    }
    
    return $sum;
}

/**
 * Format four 16-bit words (index 0 = least significant) as 16 lowercase
 * hexadecimal digits, most significant word first.
 *
 * @param array $n Words indexed 0 to 3.
 * @return string The hexadecimal string.
 */
function UINT64FormatHex($n)
{   
    $mostSignificantFirst = array($n[3], $n[2], $n[1], $n[0]);
    return vsprintf("%04x%04x%04x%04x", $mostSignificantFirst);
}

Perl

#!/usr/bin/perl
use strict;
use warnings;

# Print the hash of the sample file 'breakdance.avi' (no trailing newline).
print OpenSubtitlesHash('breakdance.avi');

# Compute the OpenSubtitles hash of a file: file size + sum of the 64-bit
# little-endian words in the first 64 KiB + sum of the 64-bit little-endian
# words in the last 64 KiB, modulo 2^64.
#
# Takes the file name; dies when it is missing or the file cannot be opened,
# seeked or closed. Returns 16 lowercase hexadecimal digits.
sub OpenSubtitlesHash {
	my $filename = shift or die("Need video filename");

	open my $handle, "<", $filename or die $!;
	# Without binmode, platforms that translate line endings corrupt the data.
	binmode $handle;
	my $fsize = -s $filename;

	# The size as four 16-bit words, least significant first. The upper half
	# is obtained by division rather than ">> 32" so it does not depend on
	# the width of perl's native integers; dropping it gave a wrong hash for
	# files larger than 4 GiB.
	my $high = int($fsize / 4294967296);
	my $hash = [$fsize & 0xFFFF, ($fsize >> 16) & 0xFFFF, $high & 0xFFFF, ($high >> 16) & 0xFFFF];

	# Head: 8192 words of 8 bytes = first 64 KiB
	$hash = AddUINT64($hash, ReadUINT64($handle)) for (1..8192);

	# Tail: last 64 KiB, starting at offset 0 for shorter files
	my $offset = $fsize - 65536;
	seek($handle, $offset > 0 ? $offset : 0, 0) or die $!;

	$hash = AddUINT64($hash, ReadUINT64($handle)) for (1..8192);

	close $handle or die $!;
	return UINT64FormatHex($hash);
}

# Read one 64-bit little-endian word from a file handle and return it as a
# reference to four 16-bit words, least significant first. Returns four zeros
# when fewer than 8 bytes are available, so reads past the end of a short
# file add nothing to the hash and trigger no "uninitialized" warnings.
sub ReadUINT64 {
	my ($handle) = @_;
	my $got = read($handle, my $u, 8);
	return [0, 0, 0, 0] unless defined $got && $got == 8;
	return [unpack("vvvv", $u)];
}

# Add two 64-bit numbers given as references to four 16-bit words (least
# significant first), carrying from word to word. A carry out of the top
# word is dropped. Returns a reference to the four words of the sum.
sub AddUINT64 {
    my ($lhs, $rhs) = @_;
    my @sum;
    my $carry = 0;

    foreach my $idx (0 .. 3) {
        my $total = $lhs->[$idx] + $rhs->[$idx] + $carry;
        $carry = ($total > 0xffff) ? 1 : 0;
        push @sum, $carry ? ($total & 0xffff) : $total;
    }

    return \@sum;
}

# Format a reference to four 16-bit words (least significant first) as 16
# lowercase hexadecimal digits, most significant word first.
sub UINT64FormatHex {
    my ($words) = @_;
    my @most_significant_first = reverse @{$words}[0 .. 3];
    return sprintf("%04x%04x%04x%04x", @most_significant_first);
}

Ruby

This is a quick translation/transliteration of the Perl script.

# Computes the OpenSubtitles hash of a file: file size + sum of the 64-bit
# little-endian words in the first 64 KiB + sum of the 64-bit little-endian
# words in the last 64 KiB, modulo 2^64. The 64-bit value is carried around as
# four 16-bit words, least significant first.
class Hasher

  # Returns the hash of +filename+ as 16 lowercase hexadecimal digits.
  # Raises when +filename+ is nil or the file cannot be opened.
  def open_subtitles_hash(filename)
    raise "Need video filename" unless filename

    # Binary mode keeps the bytes untouched on every platform; the block form
    # closes the file even when hashing raises.
    File.open(filename, "rb") do |fh|
      fsize = File.size(filename)

      # All 64 bits of the size; the upper two words matter above 4 GiB.
      hash = (0..3).map { |n| (fsize >> (16 * n)) & 0xffff }

      # Head: 8192 words of 8 bytes = first 64 KiB
      8192.times { hash = add_unit_64(hash, read_uint_64(fh)) }

      # Tail: last 64 KiB, starting at offset 0 for shorter files
      offset = fsize - 65536
      fh.seek([0, offset].max, IO::SEEK_SET)

      8192.times { hash = add_unit_64(hash, read_uint_64(fh)) }

      uint_64_format_hex(hash)
    end
  end

  # Reads one 64-bit little-endian word from +stream+ as four 16-bit words.
  # Returns four zeros when fewer than 8 bytes are left (read returns nil at
  # end of file), so short files hash instead of raising.
  def read_uint_64(stream)
    data = stream.read(8)
    return [0, 0, 0, 0] if data.nil? || data.size < 8

    data.unpack("vvvv")
  end

  # Adds two numbers held as four 16-bit words, carrying from word to word.
  # A carry out of the top word is dropped.
  def add_unit_64(hash, input)
    res = [0,0,0,0]
    carry = 0

    hash.zip(input).each_with_index do |(h,i),n|
      sum = h + i + carry
      if sum > 0xffff
        res[n] += sum & 0xffff
        carry = 1
      else
        res[n] += sum
        carry = 0
      end
    end
    return res
  end

  # Formats the four words as 16 hexadecimal digits, most significant first.
  def uint_64_format_hex(hash)
    sprintf("%04x%04x%04x%04x", *hash.reverse)
  end
end

# Self-test, run only when this file is executed directly.
if __FILE__ == $0
  require 'test/unit'

  # Checks Hasher against the published hash of the sample file; expects
  # 'breakdance.avi' at that relative path.
  class HashTester < Test::Unit::TestCase
    def setup
      @h = Hasher.new
    end

    def test_test_file_hash
      assert_equal("8e245d9679d31e12", @h.open_subtitles_hash('breakdance.avi'))
    end
  end
end


Another more "rubyesque" implementation.

# Computes the OpenSubtitles hash of a file: file size + sum of the 64-bit
# little-endian words in the first 64 KiB + sum of the 64-bit little-endian
# words in the last 64 KiB, truncated to 64 bits.
class MovieHasher

  HASH_CHUNK_SIZE = 64 * 1024 # in bytes
  HASH_SIZE = 8 # in bytes
  
  # Returns the hash of +filename+ as HASH_SIZE * 2 lowercase hexadecimal
  # digits. Raises when the file cannot be opened.
  def self.compute_hash(filename)
    # The block form closes the file even when reading raises.
    hash = File.open(filename, "rb") do |file|
      filesize = File.size(filename)
      
      head = sum_chunk(file)
      file.seek([0, filesize - HASH_CHUNK_SIZE].max, IO::SEEK_SET)
      tail = sum_chunk(file)
      
      filesize + head + tail
    end
    
    hash &= 0xffffffffffffffff # keep the first 8 bytes / 64 bits
    
    sprintf("%0" + (HASH_SIZE * 2).to_s +  "x", hash)
  end
  
  # Reads up to HASH_CHUNK_SIZE bytes from the current position and sums the
  # complete 64-bit words. "Q<" forces little-endian decoding whatever the
  # host byte order; read returns nil at end of file, which counts as an
  # empty chunk.
  def self.sum_chunk(file)
    (file.read(HASH_CHUNK_SIZE) || "").unpack("Q<*").inject(0) { |sum, n| sum + n }
  end
  private_class_method :sum_chunk
end

# Self-test, run only when this file is executed directly.
if __FILE__ == $0
  require 'test/unit'

  # Checks MovieHasher against the published hash of the sample file; expects
  # 'breakdance.avi' at that relative path.
  class MovieHasherTest < Test::Unit::TestCase
    def test_compute_hash
      assert_equal("8e245d9679d31e12", MovieHasher::compute_hash('breakdance.avi'))
    end
  end
end

Attachments (1)

Download all attachments as: .zip