wiki:HashSourceCodes

Version 10 (modified by guest, 15 years ago) (diff)

I needed to make these changes to the python code to get it to work (tested on the sample files)

The hash code is based on Media Player Classic. In natural language, it calculates: file size + 64-bit checksum of the first and last 64k (even if they overlap because the file is smaller than 128k). On opensubtitles.org the movie file size is limited to 9000000000 > $moviebytesize > 131072 bytes; if there is any reason to change these limits, let us know.

Feel free to edit/add source code if you have a faster/better implementation. Also, don't forget to check that the hash is correct. Please test with these 2 files to ensure your algorithm is completely OK:

  • AVI file (12 909 756 bytes)
    • hash: 8e245d9679d31e12
  • DUMMY RAR file (2 565 922 bytes, 4 295 033 890 after RAR unpacking)
    • hash: 61f7751fc2a72bfb

C

#include <stdio.h>
#include <stdlib.h>

#define MAX(x,y) (((x) > (y)) ? (x) : (y))
#ifndef uint64_t
#define uint64_t unsigned long long
#endif

/*
 * Sums up to 64 KiB worth of consecutive 64-bit words, starting at the
 * current file position. Stops early at end of file; a trailing partial
 * word is not counted. Words are read in host byte order.
 */
static uint64_t sum_chunk(FILE * handle)
{
        uint64_t sum = 0, word = 0;
        size_t i;

        for (i = 0; i < 65536 / sizeof(word) && fread(&word, sizeof(word), 1, handle) == 1; i++)
                sum += word;

        return sum;
}

/*
 * Computes the hash of an open binary stream: the file size plus the
 * 64-bit wrapping sum of the words in the first 64 KiB and in the last
 * 64 KiB (the two regions overlap when the file is shorter than 128 KiB).
 *
 * handle: seekable stream opened in binary mode; its position is changed.
 * Returns the hash, or 0 if the file size cannot be determined.
 */
uint64_t compute_hash(FILE * handle)
{
        uint64_t hash, fsize;
        long end;

        if (fseek(handle, 0, SEEK_END) != 0)
                return 0;
        end = ftell(handle);
        if (end < 0)
                return 0;
        fsize = (uint64_t)end;

        hash = fsize;

        fseek(handle, 0, SEEK_SET);
        hash += sum_chunk(handle);

        /* fsize is unsigned, so "fsize - 65536" would wrap for short files;
         * compare first and fall back to the start of the file. */
        fseek(handle, fsize > 65536 ? end - 65536L : 0L, SEEK_SET);
        hash += sum_chunk(handle);

        return hash;
}

/*
 * Prints the hash of a movie file as 16 zero-padded lowercase hex digits.
 * The file is taken from the first command-line argument; when none is
 * given the sample file "breakdance.avi" is used.
 * Returns 0 on success, 1 if the file cannot be opened.
 */
int main(int argc, char *argv[])
{
        const char * path = (argc > 1) ? argv[1] : "breakdance.avi";
        FILE * handle;
        uint64_t myhash;

        handle = fopen(path, "rb");

        if (!handle)
        {
                printf("Error openning file!");
                return 1;
        }

        myhash = compute_hash(handle);
        /* %llx is the standard conversion; pad to 16 digits so leading
         * zeros of the hash are not lost. */
        printf("%016llx", (unsigned long long)myhash);

        fclose(handle);
        return 0;
}

C++

 #include <fstream>
 #include <iomanip>
 #include <iostream>
 
 typedef unsigned __int64 uint64_t;
 using namespace std;
 
 // Returns the larger of x and y.
 // The parameters are 64-bit so that file offsets beyond 2 GiB are not
 // truncated when they are passed through this helper.
 long long MAX(long long x, long long y)
 {
        return (x > y) ? x : y;
 }
 
 // Sums up to 64 KiB worth of consecutive 64-bit words, starting at the
 // current read position. Stops at the first incomplete read, so a trailing
 // partial word is not counted. Words are read in host byte order.
 static uint64_t sum_chunk(std::istream& in)
 {
        uint64_t sum = 0;
        uint64_t word = 0;

        for (std::size_t i = 0;
             i < 65536 / sizeof(word) && in.read(reinterpret_cast<char*>(&word), sizeof(word));
             ++i)
                sum += word;

        return sum;
 }

 // Computes the hash of an open binary file stream: the file size plus the
 // 64-bit wrapping sum of the words in the first 64 KiB and in the last
 // 64 KiB (the regions overlap when the file is shorter than 128 KiB).
 // The stream's read position and state flags are modified.
 // Returns 0 if the file size cannot be determined.
 uint64_t compute_hash(std::ifstream& f)
 {
        const std::streamoff chunk = 65536;

        f.seekg(0, std::ios::end);
        const std::streamoff end = f.tellg();
        if (end < 0)
                return 0;

        uint64_t hash = static_cast<uint64_t>(end);

        f.seekg(0, std::ios::beg);
        hash += sum_chunk(f);

        // A short file leaves failbit set after the first pass, which would
        // turn the following seekg() into a no-op; reset the state first.
        f.clear();
        f.seekg(end > chunk ? end - chunk : 0, std::ios::beg);
        hash += sum_chunk(f);

        return hash;
 }
 
 // Prints the hash of a movie file as 16 zero-padded hex digits.
 // The file is taken from the first command-line argument; when none is
 // given, "c:\test.avi" is used.
 // Returns 0 on success, 1 if the file cannot be opened.
 int main(int argc, char *argv[])
 {
        const char* path = (argc > 1) ? argv[1] : "c:\\test.avi";
        ifstream f;
        uint64_t myhash;

        f.open(path, ios::in|ios::binary|ios::ate);
        if (!f.is_open()) {
           cerr << "Error opening file" << endl;
           return 1;
        }

        myhash = compute_hash(f);
        // setw/setfill come from <iomanip>
        cout << setw(16) << setfill('0') << hex << myhash;

        f.close();
        return 0;
 }

Java

/**
 * Hash code is based on Media Player Classic. In natural language it calculates: size + 64bit
 * checksum of the first and last 64k (even if they overlap because the file is smaller than
 * 128k).
 */
public class OpenSubtitlesHasher {
        
        /**
         * Size of the chunks that will be hashed in bytes (64 KB)
         */
        private static final int HASH_CHUNK_SIZE = 64 * 1024;
        
        
        public static String computeHash(File file) throws IOException {
                long size = file.length();
                long chunkSizeForFile = Math.min(HASH_CHUNK_SIZE, size);
                
                FileChannel fileChannel = new FileInputStream(file).getChannel();
                
                try {
                        long head = computeHashForChunk(fileChannel.map(MapMode.READ_ONLY, 0, chunkSizeForFile));
                        long tail = computeHashForChunk(fileChannel.map(MapMode.READ_ONLY, Math.max(size - HASH_CHUNK_SIZE, 0), chunkSizeForFile));
                        
                        return String.format("%016x", size + head + tail);
                } finally {
                        fileChannel.close();
                }
        }
        

        public static String computeHash(InputStream stream, long length) throws IOException {
                
                int chunkSizeForFile = (int) Math.min(HASH_CHUNK_SIZE, length);
                
                // buffer that will contain the head and the tail chunk, chunks will overlap if length is smaller than two chunks
                byte[] chunkBytes = new byte[(int) Math.min(2 * HASH_CHUNK_SIZE, length)];
                
                DataInputStream in = new DataInputStream(stream);
                
                // first chunk
                in.readFully(chunkBytes, 0, chunkSizeForFile);
                
                long position = chunkSizeForFile;
                long tailChunkPosition = length - chunkSizeForFile;
                
                // seek to position of the tail chunk, or not at all if length is smaller than two chunks
                while (position < tailChunkPosition && (position += in.skip(tailChunkPosition - position)) >= 0);
                
                // second chunk, or the rest of the data if length is smaller than two chunks
                in.readFully(chunkBytes, chunkSizeForFile, chunkBytes.length - chunkSizeForFile);
                
                long head = computeHashForChunk(ByteBuffer.wrap(chunkBytes, 0, chunkSizeForFile));
                long tail = computeHashForChunk(ByteBuffer.wrap(chunkBytes, chunkBytes.length - chunkSizeForFile, chunkSizeForFile));
                
                return String.format("%016x", length + head + tail);
        }
        

        private static long computeHashForChunk(ByteBuffer buffer) {
                
                LongBuffer longBuffer = buffer.order(ByteOrder.LITTLE_ENDIAN).asLongBuffer();
                long hash = 0;
                
                while (longBuffer.hasRemaining()) {
                        hash += longBuffer.get();
                }
                
                return hash;
        }
        
}

C#

If you get overflow error read this.

using System;
using System.Text;
using System.IO;
   
namespace MovieHasher
{
    class Program
    {
        /// <summary>Opens the named file and returns its movie hash (8 bytes, most significant first).</summary>
        private static byte[] ComputeMovieHash(string filename)
        {
            byte[] result;
            using (Stream input = File.OpenRead(filename))
            {
                result = ComputeMovieHash(input);
            }
            return result;
        }
 
        /// <summary>
        /// Computes the movie hash of a seekable stream: its length plus the wrapping
        /// 64-bit sum of the words in the first and in the last 64 KiB.
        /// The stream is closed before returning.
        /// </summary>
        /// <returns>The hash as 8 bytes, reversed from BitConverter order so that
        /// hex-formatting them in sequence yields the usual hash string on a
        /// little-endian host.</returns>
        private static byte[] ComputeMovieHash(Stream input)
        {
            long streamsize = input.Length;
            long lhash = streamsize;
 
            // unchecked: the hash is defined modulo 2^64, so wrap-around is
            // intended even when the project enables overflow checking.
            lhash = unchecked(lhash + SumChunk(input));
 
            input.Position = Math.Max(0, streamsize - 65536);
            lhash = unchecked(lhash + SumChunk(input));
 
            input.Close();
            byte[] result = BitConverter.GetBytes(lhash);
            Array.Reverse(result);
            return result;
        }
 
        /// <summary>
        /// Sums up to 64 KiB of consecutive 64-bit words from the current position.
        /// A trailing partial word at the end of the stream is not counted.
        /// </summary>
        private static long SumChunk(Stream input)
        {
            byte[] buffer = new byte[sizeof(long)];
            long sum = 0;
 
            for (int i = 0; i < 65536 / sizeof(long); i++)
            {
                // Stream.Read may deliver fewer bytes than requested, so keep
                // reading until the word is complete or the stream ends.
                int filled = 0;
                while (filled < buffer.Length)
                {
                    int got = input.Read(buffer, filled, buffer.Length - filled);
                    if (got <= 0)
                    {
                        break;
                    }
                    filled += got;
                }
                if (filled < buffer.Length)
                {
                    break;
                }
                sum = unchecked(sum + BitConverter.ToInt64(buffer, 0));
            }
 
            return sum;
        }
 
        /// <summary>Formats bytes as lowercase hex, two digits per byte.</summary>
        private static string ToHexadecimal(byte[] bytes)
        {
            StringBuilder hexBuilder = new StringBuilder();
            for(int i = 0; i < bytes.Length; i++)
            {
                hexBuilder.Append(bytes[i].ToString("x2"));
            }
            return hexBuilder.ToString();
        }
 
        static void Main(string[] args)
        {
            byte[] moviehash = ComputeMovieHash(@"C:\test.avi");
            Console.WriteLine("The hash of the movie-file is: {0}", ToHexadecimal(moviehash));
        }
    }
}

VB.Net

Imports System
Imports System.Text
Imports System.IO
'Note: you must remove integer overflow checking.

Namespace MovieHasher
	Class Program
		''' <summary>Opens the named file and returns its movie hash (8 bytes, most significant first).</summary>
		Private Shared Function ComputeMovieHash(ByVal filename As String) As Byte()
			Dim result As Byte()
			Using input As Stream = File.OpenRead(filename)
				result = ComputeMovieHash(input)
			End Using
			Return result
		End Function
		
		''' <summary>
		''' Computes the movie hash of a seekable stream: its length plus the 64-bit sum
		''' of the words in the first and in the last 64 KiB. The stream is closed
		''' before returning. Integer overflow checking must be disabled for the
		''' project, because the sum is expected to wrap around.
		''' </summary>
		Private Shared Function ComputeMovieHash(ByVal input As Stream) As Byte()
			Dim streamsize As Long = input.Length
			Dim lhash As System.Int64 = streamsize
			
			lhash += SumChunk(input)
			
			input.Position = Math.Max(0, streamsize - 65536)
			lhash += SumChunk(input)
			
			input.Close()
			Dim result As Byte() = BitConverter.GetBytes(lhash)
			Array.Reverse(result)
			Return result
		End Function
		
		''' <summary>
		''' Sums up to 64 KiB of consecutive 64-bit words from the current position.
		''' A trailing partial word at the end of the stream is not counted.
		''' </summary>
		Private Shared Function SumChunk(ByVal input As Stream) As Long
			' Size of a Long in bytes; a constant avoids depending on Marshal.SizeOf.
			Const WordSize As Integer = 8
			Dim buffer As Byte() = New Byte(WordSize - 1) {}
			Dim sum As Long = 0
			
			For i As Integer = 1 To 65536 \ WordSize
				' Stream.Read may deliver fewer bytes than requested, so keep
				' reading until the word is complete or the stream ends.
				Dim filled As Integer = 0
				While filled < WordSize
					Dim got As Integer = input.Read(buffer, filled, WordSize - filled)
					If got <= 0 Then Exit While
					filled += got
				End While
				If filled < WordSize Then Exit For
				
				sum += BitConverter.ToInt64(buffer, 0)
			Next
			
			Return sum
		End Function
		
		''' <summary>Formats bytes as lowercase hex, two digits per byte.</summary>
		Private Shared Function ToHexadecimal(ByVal bytes As Byte()) As String
			Dim hexBuilder As New StringBuilder()
			For i As Integer = 0 To bytes.Length - 1
				hexBuilder.Append(bytes(i).ToString("x2"))
			Next
			Return hexBuilder.ToString()
		End Function
		
		Private Shared Sub Main(ByVal args As String())
			Dim moviehash As Byte() = ComputeMovieHash("C:\test.avi")
			Console.WriteLine("The hash of the movie-file is: {0}", ToHexadecimal(moviehash))
		End Sub
	End Class
End Namespace

Python

import struct, os

def hashFile(name):
    """Return the OpenSubtitles hash of the file `name` as 16 hex digits.

    The hash is the file size plus the sum of the little-endian 64-bit
    words in the first and the last 64 KiB, kept modulo 2**64.

    Returns the string "SizeError" when the file is smaller than 128 KiB
    and "IOError" when the file cannot be opened or read.
    """
    chunk_size = 65536
    # '<' pins little-endian byte order and the standard 8-byte size,
    # independent of the platform the script runs on.
    chunk_format = "<%dQ" % (chunk_size // 8)
    mask = 0xFFFFFFFFFFFFFFFF  # to remain as 64bit number

    try:
        # 'with' closes the file on every path, including the early return.
        with open(name, "rb") as f:
            filesize = os.path.getsize(name)
            if filesize < chunk_size * 2:
                return "SizeError"

            head = f.read(chunk_size)
            f.seek(max(0, filesize - chunk_size), 0)
            tail = f.read(chunk_size)

        hash = filesize
        for chunk in (head, tail):
            hash = (hash + sum(struct.unpack(chunk_format, chunk))) & mask

        return "%016x" % hash

    except (IOError, OSError):
        return "IOError"

Delphi

This is just a quick conversion of Gabest's original C code. Anyone who can come up with a cleaner code, please feel free to do so and post here.

{ Returns the hash of the file fname as 16 hex digits: the file size plus the
  64-bit sum of the words in the first and in the last 64 KiB of the file.
  Returns an empty string when the file does not exist.
  Note: the sum is expected to wrap around, so overflow checking must be off. }
function CalcGabestHash(const fname: string): string;
const
  ChunkSize = 65536;
var
  i       : integer;
  pass    : integer;
  tmp     : Int64;   // read target; always 8 bytes, unlike an array of Char
  hash    : Int64;

  aStream: TFileStream;
begin
  result := '';
  if not FileExists(fname) then Exit;

  aStream := TFileStream.Create(fName, fmOpenRead or fmShareDenyNone);
  try
    hash := aStream.Size;

    for pass := 1 to 2 do begin
      if pass = 2 then
      begin
        // Seeking 64 KiB back from the end is only valid for files at least
        // that long; shorter files are summed from the start again.
        if aStream.Size > ChunkSize then
          aStream.Seek(-ChunkSize, soFromEnd)
        else
          aStream.Seek(0, soFromBeginning);
      end;

      i := 0;
      // Only complete 8-byte words are added; a partial read ends the pass.
      while (i < ChunkSize div SizeOf(tmp)) and
            (aStream.Read(tmp, SizeOf(tmp)) = SizeOf(tmp)) do begin
        hash := hash + tmp;
        i := i + 1;
      end;
    end;
  finally
    aStream.Free;
  end;

  result := Format('%.16x',[hash]);
end;

RealBasic

Combined routine that will calculate a fast hash for videofiles over 65K and a normal md5 for subtitles

    // Fragment: computes either the fast video hash or an md5 hex string,
    // depending on the value of routine, and stores the result in myhash.
    // NOTE(review): f, routine, bytesizestr and myhash are not declared in
    // this fragment - presumably they belong to the enclosing method; confirm.
    dim b as BinaryStream
    dim mb as MemoryBlock
    
    dim hash,bytesize as UINT64
    dim i, x, chunksize, filelen, difference as integer
    
    hash = 0 //Reset Hash
    difference = 0
    
    if f <> nil and f.Exists then
      b= f.OpenAsBinaryFile
      // The hash starts out as the file length
      hash = b.Length
      bytesize = b.Length
      bytesizestr = str(bytesize)
      
      if bytesize >= 65536 and routine = "video" then
        // Video hash: add every little-endian 64-bit word of the first 64K ...
        chunksize = 65536
        mb = b.Read(65536)
        mb.LittleEndian = True
        
        for i= 0 to chunksize -1 step 8
          hash = hash+ mb.UINT64Value(i)
        next
        
        // ... and of the last 64K (read from offset 0 if the file is shorter)
        b.Position = max(b.Length-chunksize, 0)
        mb= b.Read(chunksize)
        mb.LittleEndian = True
        
        for i= 0 to chunksize -1 step 8
          hash = hash+ mb.UINT64Value(i)
        next
        
        myhash = Lowercase(str(hex(hash)))
        
      elseif routine = "subtitle" then
        
        // Subtitle hash: md5 of the whole file, written as lowercase hex
        dim c,result as string
        mb = md5(b.Read(b.Length))
        mb.LittleEndian = True
        
        for i = 0 to mb.size-1
          x = mb.byte( i )
          // left-pad each byte to two hex digits
          c = right( "00"+hex( x ), 2 )
          result = result + c
        next
        result = lowercase( result )
        myhash = result
        
      end

PHP 4/5

/**
 * Computes the OpenSubtitles hash of a file: the file size plus the 64-bit
 * sum of the words in the first and in the last 64 KiB.
 *
 * The running value is kept as four 16-bit limbs (index 0 = least
 * significant) so that it does not depend on native 64-bit arithmetic.
 *
 * @param string $file path of the file to hash
 * @return string|false 16 lowercase hex digits, or false if the file cannot be opened
 */
function OpenSubtitlesHash($file)
{
    $handle = fopen($file, "rb");
    if ($handle === false)
    {
        return false;
    }
    $fsize = filesize($file);
    
    // Sizes of 4 GiB and more need the two upper limbs as well. Shifting by
    // 32 or more is only meaningful when integers are 64 bits wide.
    $wide = defined('PHP_INT_SIZE') && PHP_INT_SIZE >= 8;
    
    $hash = array(3 => $wide ? (($fsize >> 48) & 0xFFFF) : 0, 
                  2 => $wide ? (($fsize >> 32) & 0xFFFF) : 0, 
                  1 => ($fsize >> 16) & 0xFFFF, 
                  0 => $fsize & 0xFFFF);
        
    // first 64 KiB = 8192 words of 8 bytes
    for ($i = 0; $i < 8192; $i++)
    {
        $tmp = ReadUINT64($handle);
        $hash = AddUINT64($hash, $tmp);
    }
    
    // last 64 KiB, or the whole file again when it is shorter than that
    $offset = $fsize - 65536;
    fseek($handle, $offset > 0 ? $offset : 0, SEEK_SET);
    
    for ($i = 0; $i < 8192; $i++)
    {
        $tmp = ReadUINT64($handle);
        $hash = AddUINT64($hash, $tmp);         
    }
    
    fclose($handle);
    return UINT64FormatHex($hash);
}

/**
 * Reads one little-endian 64-bit word from $handle as four 16-bit limbs
 * (index 0 = least significant).
 *
 * @param resource $handle open stream positioned at the word to read
 * @return array four integers; all zero when fewer than 8 bytes are left,
 *               so an incomplete trailing word adds nothing to a sum
 */
function ReadUINT64($handle)
{
    $bytes = fread($handle, 8);
    if ($bytes === false || strlen($bytes) < 8)
    {
        // unpack() would fail on a short buffer and produce null limbs
        return array(0 => 0, 1 => 0, 2 => 0, 3 => 0);
    }
    
    $u = unpack("va/vb/vc/vd", $bytes);
    return array(0 => $u["a"], 1 => $u["b"], 2 => $u["c"], 3 => $u["d"]);
}

/**
 * Adds two 64-bit values given as four 16-bit limbs each (index 0 = least
 * significant), propagating the carry from limb to limb. A carry out of
 * the top limb is discarded.
 *
 * @param array $a first operand
 * @param array $b second operand
 * @return array the sum, in the same four-limb layout
 */
function AddUINT64($a, $b)
{
    $sum = array(0 => 0, 1 => 0, 2 => 0, 3 => 0);
    $overflow = 0;

    foreach (array_keys($sum) as $limb)
    {
        $total = $a[$limb] + $b[$limb] + $overflow;
        $overflow = ($total > 0xffff) ? 1 : 0;
        // only a limb that overflowed needs trimming to 16 bits
        $sum[$limb] += $overflow ? ($total & 0xffff) : $total;
    }

    return $sum;
}

/**
 * Formats a four-limb 64-bit value as 16 lowercase hex digits, most
 * significant limb (index 3) first.
 *
 * @param array $n four 16-bit limbs, index 0 = least significant
 * @return string the hex representation
 */
function UINT64FormatHex($n)
{
    $mostSignificantFirst = array($n[3], $n[2], $n[1], $n[0]);
    return vsprintf("%04x%04x%04x%04x", $mostSignificantFirst);
}

Perl

#!/usr/bin/perl
use strict;
use warnings;

# Hash the file named on the command line, or the sample file when no
# argument is given.
print OpenSubtitlesHash(@ARGV ? $ARGV[0] : 'breakdance.avi');

# Computes the OpenSubtitles hash of a file: the file size plus the 64-bit
# sum of the words in the first and in the last 64 KiB. The running value is
# kept as four 16-bit limbs (index 0 = least significant).
# Takes the file name; returns the hash as 16 lowercase hex digits.
# Dies when no name is given or the file cannot be opened, seeked or closed.
sub OpenSubtitlesHash {
    my $filename = shift or die("Need video filename");

    open my $handle, "<", $filename or die $!;
    # The data is binary; keep the I/O layer from translating line endings.
    binmode $handle;
    my $fsize = -s $filename;

    # All four limbs of the size are needed for files of 4 GiB and more
    # (the shifts by 32 and 48 assume a perl with 64-bit integers).
    my $hash = [map { ($fsize >> $_) & 0xFFFF } (0, 16, 32, 48)];

    # first 64 KiB = 8192 words of 8 bytes
    $hash = AddUINT64($hash, ReadUINT64($handle)) for (1..8192);

    # last 64 KiB, or the whole file again when it is shorter than that
    my $offset = $fsize - 65536;
    seek($handle, $offset > 0 ? $offset : 0, 0) or die $!;

    $hash = AddUINT64($hash, ReadUINT64($handle)) for (1..8192);

    close $handle or die $!;
    return UINT64FormatHex($hash);
}

# Reads one little-endian 64-bit word from the given handle and returns it as
# a reference to four 16-bit limbs (index 0 = least significant).
# When fewer than 8 bytes are left, all limbs are zero, so an incomplete
# trailing word adds nothing to a sum and no undefined limbs are produced.
sub ReadUINT64 {
    my ($handle) = @_;
    my $got = read($handle, my $u, 8);
    return [0, 0, 0, 0] unless defined $got && $got == 8;
    return [unpack("vvvv", $u)];
}

# Adds two 64-bit values, each given as a reference to four 16-bit limbs
# (index 0 = least significant), carrying from limb to limb. A carry out of
# the top limb is dropped. Returns a reference to the four limbs of the sum.
sub AddUINT64 {
    my ($lhs, $rhs) = @_;
    my @sum = (0, 0, 0, 0);
    my $carry = 0;

    for my $limb (0 .. $#sum) {
        my $total = $lhs->[$limb] + $rhs->[$limb] + $carry;
        $carry = $total > 0xffff ? 1 : 0;
        # only a limb that overflowed needs trimming to 16 bits
        $sum[$limb] += $carry ? ($total & 0xffff) : $total;
    }

    return \@sum;
}

# Formats a four-limb 64-bit value (array reference, index 0 = least
# significant) as 16 lowercase hex digits, most significant limb first.
sub UINT64FormatHex {
    my ($limbs) = @_;
    my @most_significant_first = reverse @{$limbs}[0 .. 3];
    return sprintf("%04x%04x%04x%04x", @most_significant_first);
}

Ruby

This is a quick translation/transliteration of the Perl script.

# Computes the OpenSubtitles hash with 16-bit limb arithmetic: the file size
# plus the 64-bit sum of the words in the first and in the last 64 KiB.
class Hasher

  # Returns the hash of the named file as 16 lowercase hex digits.
  # Raises when no filename is given.
  def open_subtitles_hash(filename)
    raise "Need video filename" unless filename

    fsize = File.size(filename)

    # Four 16-bit limbs, least significant first; the upper two limbs matter
    # for files of 4 GiB and more.
    hash = [0, 16, 32, 48].map { |shift| (fsize >> shift) & 0xffff }

    # Binary mode keeps the bytes untranslated; the block form closes the
    # file even when reading raises.
    File.open(filename, 'rb') do |fh|
      8192.times { hash = add_unit_64(hash, read_uint_64(fh)) }

      offset = fsize - 65536
      fh.seek([0,offset].max, 0)

      8192.times { hash = add_unit_64(hash, read_uint_64(fh)) }
    end

    return uint_64_format_hex(hash)
  end

  # Reads one little-endian 64-bit word as four 16-bit limbs (least
  # significant first). Returns four zeros when fewer than 8 bytes are left,
  # so short files no longer fail on a nil read.
  def read_uint_64(stream)
    bytes = stream.read(8)
    return [0, 0, 0, 0] if bytes.nil? || bytes.bytesize < 8

    bytes.unpack("vvvv")
  end

  # Adds two four-limb values, carrying from limb to limb; a carry out of the
  # top limb is dropped.
  def add_unit_64(hash, input)
    res = [0,0,0,0]
    carry = 0

    hash.zip(input).each_with_index do |(h,i),n|
      sum = h + i + carry
      if sum > 0xffff
        res[n] += sum & 0xffff
        carry = 1
      else
        res[n] += sum
        carry = 0
      end
    end
    return res
  end

  # Formats four limbs as 16 lowercase hex digits, most significant first.
  def uint_64_format_hex(hash)
    sprintf("%04x%04x%04x%04x", *hash.reverse)
  end
end

# Self-test, executed only when this file is run directly.
if $0 == __FILE__
  require 'test/unit'

  # Checks Hasher against the published hash of the sample AVI file.
  class HashTester < Test::Unit::TestCase
    def setup
      @hasher = Hasher.new
    end

    def test_test_file_hash
      actual = @hasher.open_subtitles_hash('breakdance.avi')
      assert_equal("8e245d9679d31e12", actual)
    end
  end
end


Another more "rubyesque" implementation.

# Computes the OpenSubtitles hash: the file size plus the 64-bit sum of the
# words in the first and in the last 64 KiB of the file.
module MovieHasher

  CHUNK_SIZE = 64 * 1024 # in bytes

  # Returns the hash of the named file as 16 lowercase hex digits.
  def self.compute_hash(filename)
    filesize = File.size(filename)
    hash = filesize

    # Read 64 kbytes, divide up into 64 bits and add each
    # to hash. Do for beginning and end of file.
    File.open(filename, 'rb') do |f|
      hash = add_chunk(hash, f.read(CHUNK_SIZE))

      f.seek([0, filesize - CHUNK_SIZE].max, IO::SEEK_SET)

      # And again for the end of the file
      hash = add_chunk(hash, f.read(CHUNK_SIZE))
    end

    sprintf("%016x", hash)
  end

  # Adds every complete 64-bit word of chunk to hash, modulo 2**64.
  # chunk is nil when the read started at end of file; hash is then
  # returned unchanged.
  def self.add_chunk(hash, chunk)
    return hash if chunk.nil?

    # Q< = unsigned 64 bit, explicitly little-endian, so the result does not
    # depend on the byte order of the machine running the script.
    chunk.unpack("Q<*").inject(hash) do |sum, n|
      (sum + n) & 0xffffffffffffffff # to remain as 64 bit number
    end
  end
  private_class_method :add_chunk
end

# Self-test, executed only when this file is run directly.
if $0 == __FILE__
  require 'test/unit'

  # Checks MovieHasher against the published hashes of the two sample files.
  class MovieHasherTest < Test::Unit::TestCase
    def test_compute_hash
      actual = MovieHasher.compute_hash('breakdance.avi')
      assert_equal("8e245d9679d31e12", actual)
    end

    def test_compute_hash_large_file
      actual = MovieHasher.compute_hash('dummy.bin')
      assert_equal("61f7751fc2a72bfb", actual)
    end
  end
end

Attachments (1)

Download all attachments as: .zip