Blog

A catalogue of my discoveries in software development and related subjects, that I think might be of use or interest to everyone else, or to me when I forget what I did!

Thread.Join Deadlock with Invoke

September 07, 2007

If you are trying to Make Thread-Safe Calls to Windows Forms Controls then you can run into deadlock problems when calling 'Invoke' from your thread, if your main thread is waiting for a Thread.Join on the calling thread. I posted a workaround for this on the Microsoft site, which uses an extra thread to make the call to the Invoke method, to avoid the deadlock. VB.NET example:
''' <summary>
''' Sets the text of textbox1, whichever thread the caller is running on.
''' </summary>
''' <param name="str_text">The text to display in textbox1.</param>
Private Sub SetText(ByVal str_text As String)
        'no marshalling needed: assign the text directly
        If Not Me.textbox1.InvokeRequired Then
                Me.textbox1.Text = str_text
                Return
        End If

        'marshalling needed: hand the Invoke call to a separate background thread
        'so that this thread is never the one blocked inside Invoke
        Dim marshalThread As New Threading.Thread(New Threading.ParameterizedThreadStart(AddressOf AsyncInvoke))
        marshalThread.IsBackground = True
        marshalThread.Start(str_text)

        Application.DoEvents()
End Sub
''' <summary>
''' Thread entry point used by SetText: calls SetText again through Me.Invoke,
''' passing the text on as the single argument.
''' </summary>
''' <param name="obj_text">The text to set, boxed as Object because the method is
''' started through ParameterizedThreadStart.</param>
Private Sub AsyncInvoke(ByVal obj_text As Object)
        Dim str_text As String = CStr(obj_text)
        Dim d As New SetTextCallback(AddressOf SetText)
        'pass the unboxed text on (the original passed an undeclared name, str_message)
        Me.Invoke(d, New Object() {str_text})
End Sub
Permalink: Thread.Join Deadlock with Invoke

Bio - Kinetic

March 01, 2007

I then started working for a worldwide advertising media group, Kinetic, where I worked primarily on in-house systems. The development was mainly in ASP.NET and MS-SQL, although it involved the incorporation of many other technologies, including writing client side controls, Microsoft SharePoint and the research and development of any other potential useful technology.

Kinetic Logo
Permalink: Bio - Kinetic

x86 Assembly Base64 Encoder

September 19, 2006

I joined in a "competition" with some of the experts in the assembly forums over at experts-exchange to write the fastest assembly code Base64 encoder, following the guidelines of RFC 3548. Several versions emerged, some using lookup tables to precompute results, which fared well on CPUs with large caches. My version was more about opcode optimization and using bitwise arithmetic over decimal arithmetic, which achieved the best results on CPUs that have better pipelining. Code:
  /*
   * ToBase64 - Base64-encode a byte buffer using 32-bit x86 inline assembly.
   *
   *   pSrc      : source bytes; exactly 'len' bytes are read, never more
   *   pszOutBuf : receives 4 characters for every (started) group of 3 source
   *               bytes, i.e. 4 * ((len + 2) / 3) characters, '=' padded.
   *               No terminating NUL is written - the caller adds one if needed.
   *   len       : number of source bytes; 0 or a negative value writes nothing
   *
   * Register roles inside the __asm block:
   *   eax = current 24-bit group, left-aligned in bits 31..8 (pad count at the end)
   *   ebx = (bl) character fetched from the alphabet
   *   ecx = source bytes remaining (pad count in the tail)
   *   edx = scratch: 6-bit alphabet index
   *   esi = read pointer, edi = alphabet, ebp = write pointer (saved/restored)
   */
  void ToBase64( BYTE* pSrc, char* pszOutBuf, int len )
{
      const char* chr_table="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

       __asm{
                  mov ecx, len
                  mov esi, pSrc                              //;bytes from source
                  mov edi, chr_table
                  mov edx, pszOutBuf                        //;fetch the last C variable before touching esp/ebp
                  push ebp
                  mov ebp, edx                              //;ebp = output pointer from here on

                  cmp ecx, 3
                  jl tail_bytes                             //;fewer than 3 bytes: no complete group to encode

src_byteLoop:

                  xor eax, eax

                  //;read 3 bytes -> eax = b0 b1 b2 00
                  mov ah, byte ptr[esi]
                  mov al, byte ptr[esi+1]
                  shl eax, 16
                  mov ah, byte ptr[esi+2]

                  //;sextet 1: top 6 bits of eax index the alphabet
                  mov edx, eax
                  shl eax, 6                                //;next sextet moves to the top
                  shr edx, 26
                  mov bl, byte ptr [edi+edx]
                  mov byte ptr[ebp], bl
                  inc ebp

                  //;sextet 2
                  mov edx, eax
                  shl eax, 6
                  shr edx, 26
                  mov bl, byte ptr [edi+edx]
                  mov byte ptr[ebp], bl
                  inc ebp

                  //;sextet 3
                  mov edx, eax
                  shl eax, 6
                  shr edx, 26
                  mov bl, byte ptr [edi+edx]
                  mov byte ptr[ebp], bl
                  inc ebp

                  //;sextet 4
                  mov edx, eax
                  shl eax, 6
                  shr edx, 26
                  mov bl, byte ptr [edi+edx]
                  mov byte ptr[ebp], bl
                  inc ebp

                  //;done these bytes
                  add esi, 3
                  sub ecx, 3

                  cmp ecx, 3
                  jge src_byteLoop                          //;still got a full group

tail_bytes:
                  xor eax, eax                              //;zero group / pad count
                  test ecx, ecx
                  jle finished                              //;nothing left (or len <= 0): no padding

                  //;1 or 2 bytes left: load only the bytes that exist,
                  //;the missing ones stay zero as the RFC requires
                  mov ah, byte ptr[esi]
                  cmp ecx, 2
                  jl tail_loaded                            //;only one byte left
                  mov al, byte ptr[esi+1]
tail_loaded:
                  shl eax, 16                               //;eax = b0 b1 00 00

                  sub ecx, 3
                  neg ecx                                   //;ecx = pad count = 3 - bytes left

                  //;emit all 4 sextets, the trailing ones are overwritten with '=' below
                  mov edx, eax
                  shl eax, 6
                  shr edx, 26
                  mov bl, byte ptr [edi+edx]
                  mov byte ptr[ebp], bl
                  inc ebp

                  mov edx, eax
                  shl eax, 6
                  shr edx, 26
                  mov bl, byte ptr [edi+edx]
                  mov byte ptr[ebp], bl
                  inc ebp

                  mov edx, eax
                  shl eax, 6
                  shr edx, 26
                  mov bl, byte ptr [edi+edx]
                  mov byte ptr[ebp], bl
                  inc ebp

                  mov edx, eax
                  shl eax, 6
                  shr edx, 26
                  mov bl, byte ptr [edi+edx]
                  mov byte ptr[ebp], bl
                  inc ebp

                  mov eax, ecx                              //;'return' pad count

finished:
                  test eax, eax
                  jz enc_done
                  //;some chars were padding, replace them with =

                  sub ebp, eax                              //;move ptr back over the chars to pad
padChars:
                  mov byte ptr[ebp], 0x3d                   //;=
                  inc ebp
                  dec eax

                  jnz padChars

enc_done:
                  pop ebp
        }
}
There were several key points to my optimization technique:
  • Firstly unrolling of loops, which basically means eliminating any kind of loop structures in the code, in favour of manually typing the code to perform the operations on the data. In terms of asm optimizations this reduces branching and shifting EIP too often.
  • Another key point was minimizing the load/store operations performed. To keep these down, you can use indirect addressing to get to the byte in memory you are interested in, e.g. "mov edx, base; add edx, edi; mov bl, byte ptr[edx]" becomes "mov bl, [edx+edi]" (with edx already holding the base).
  • As Base64 encoding requires you to work at bit level using 3 bytes at a time, you can speed up the conversion of the bit sets by using the binary arithmetic operations of SHL and SHR (shift left/right) to progressively take the most significant bits of one DWORD and operate on them with a lower significance by shifting them. For example, if EAX contains a DWORD and you copy that to EDX, you can get rid of the leftmost 6 bits of EAX by shifting left 6. You can deal with those 6 bits in EDX, by shifting EDX right 26 times. This is much quicker than shifting out one bit at a time.
My final throughput on the official test machine was 196.92 MB/s, which is almost double the throughput of the CryptoAPI function, which came in at 104.05 MB/s
Permalink: x86 Assembly Base64 Encoder

Bio - Adtec Software

September 01, 2005

After leaving University I found my first job in software development at Adtec Software, where I helped to develop software and database solutions, mainly for Debt Collection Agencies. During my experience there I continued to develop my skills, learning new applications, learning real life development techniques and dealing directly with clients. It gave me the opportunity to put my skills into practice and to learn more about how development of software and database applications is carried out in industry. I was also given the opportunity to study Microsoft professional courses in VB .NET.

Adtec Logo
Permalink: Bio - Adtec Software

C++ Port Scanner using Winsock with Timeout

July 30, 2005

When writing the UltraChat program, I needed to write a port scanner that would look for running UltraChat servers. A port scanner basically attempts to connect to a host on a given port and returns true or false depending on whether a connection was (or wasn't) established. The problem with doing this is that when a connection cannot be established, Winsock (Windows Sockets) will keep trying, allowing for servers that are slow to reply, which then slows the whole process down. To get around this I used a simple threading and timer technique to kill the request if it doesn't succeed within a given time. Code:
bool quickConnect(char* remoteHostIP, int port)
{
      struct sockaddr_in sAddress;

      //set up connection info
      sAddress.sin_family = AF_INET;
      sAddress.sin_port = htons(port);                                                      //port
      sAddress.sin_addr.S_un.S_addr = inet_addr(remoteHostIP);                  //IP

      //use a thread to do the connection
      HANDLE tConnThread;
      DWORD threadID;
      DWORD tExitCode=0;

      tConnThread=CreateThread(NULL,0,&quickConnectTTL,&sAddress,0,&threadID);

      //now wait 1seconds maximum for response
      SYSTEMTIME now;
      GetSystemTime(&now);

      int finishTime=(now.wDay * 24 * 60)+(now.wHour * 60)+(now.wSecond)+1;
      int nowTime=0;

      while(nowTime<finishTime){
            GetSystemTime(&now);
            nowTime=(now.wDay * 24 * 60)+(now.wHour * 60)+(now.wSecond);
            //check if already exited, dont waste time
            GetExitCodeThread(tConnThread,&tExitCode);
            if(tExitCode!=STILL_ACTIVE){
                  break;
            }
      }

      //get the return value from connection
      GetExitCodeThread(tConnThread,&tExitCode);

      //if thread did not exit, time is up, close the thread and assume no server
      if(tExitCode==STILL_ACTIVE){
            TerminateThread(tConnThread,0);
            tExitCode=0;
      }

      CloseHandle(tConnThread);

      bool present;
      //check return values
      if(tExitCode==1){
            present=true;
      }
      else{
            present=false;
      }

      return present;

}

//Thread entry point for quickConnect: performs one blocking TCP connect.
//  sAddressPTR : pointer to a filled-in struct sockaddr_in (copied on entry)
//  returns     : 1 if the connection was established, 0 otherwise; the value
//                becomes the thread exit code read via GetExitCodeThread
DWORD WINAPI quickConnectTTL(LPVOID sAddressPTR){
      //copy struct info from PTR
      struct sockaddr_in sAddress;
      memcpy(&sAddress,sAddressPTR,sizeof(struct sockaddr_in));

      SOCKET s=socket(AF_INET,SOCK_STREAM,0);
      if(s==INVALID_SOCKET){
            return 0;                                 //no socket available, report "not connected"
      }

      //connect to remote host
      int concode=connect(s,(struct sockaddr *)&sAddress,sizeof(sAddress));

      //the socket is only a probe: release it whether or not the connect worked
      closesocket(s);

      //returning from the thread function sets the thread's exit code
      return (concode==SOCKET_ERROR) ? 0 : 1;
}
The above example waits 1 second for a reply, and then assumes a false result. To use the code, you need to include the windows sockets headers and initialise the winsock, then call quickConnect passing the host/port.
Permalink: C++ Port Scanner using Winsock with Timeout

Bio - College and University

July 01, 2005

I have decided to deprecate my old bio and have backdated this blog to incorporate the old content:-

I have had an interest in computers for as long as I can remember now, but my real interest in programming started when I got my first home PC in 2000. I decided to teach myself HTML, as this was an easy language to start with and there is lots of available material. I decided to follow up my interest by applying for a course at Bradford College, where I was advised to take the AVCE in ICT, in order to have a broader knowledge base of all IT aspects. Upon finishing the course my skills had come a long way, especially with my programming, as I had studied Visual Basic as part of my course, and also took it upon myself to study extra curricular courses with City & Guilds in Java & C++.

I was then advised by my tutors of a HND course in Software Engineering, which sounded like the perfect course for me. I took the course very seriously as this was now developing into a viable career for myself, in something I loved doing. I passed the HND with honours, getting 13 distinctions and 2 merits. I then went on to Bradford University to take my career even further, where I studied a BSc Hons Degree in Software Development Applications, for which I received a 1st class with honours.

During this time, I did quite a bit of work for local companies, who required websites for their businesses, although this was not my preferred line of work I thought it useful to have some experience in the business world. I have developed websites for 'Xtreme Tints', 'Shalwaar-Kameez Company', 'Ahmad Textiles' & 'Storm Motorsport'. I developed my own personal website 'craigwardman.com' as well as making an online community for car enthusiasts 'BadRides.co.uk' which is now closed. I have also done more extensive IT work for 'The Shalwaar-Kameez Company' such as graphic design, remote security camera systems and general PC maintenance.

To keep with my main interest in programming, I am constantly developing my own software solutions, to help me in my everyday tasks, one of which, being 'MediaViewer' a picture/video/audio slideshowing, browsing and search program, which is available to download in the projects section of this site. I have also developed a chat program in C++, which was originally for my BSc final year project. Using the Win32 API and Windows Sockets I enjoyed building this program and hope to continue its development (also available to download in the projects section).
Permalink: Bio - College and University

Extending Functionality of a compiled PE using DLLs

June 29, 2005

Sometimes it is necessary to extend the functionality of a program to which you don't have the source code. This tutorial focuses on adding functionality to a compiled Windows Portable Executable file, but the idea can probably be implemented on other compiled binaries. Adding simple functionality can be achieved directly in 'code caves' within the PE file by adding the assembly code directly to the exe to perform the operation (i.e. use a hex editor with assembler to render the opcodes directly into some empty space in the exe file). The more complex the functionality, the more assembly code this will require, which needs bigger code caves and takes more coding on your part. You can create bigger code caves by adding new sections to the file, but that is not the topic of this tutorial. An easier way, I have found, is to code the extra functionality in a DLL, which is best coded in C++/C as these are easier to import into the target app. You should use the extern "C" linkage specifier on your exports to make them asm friendly (no mangling) and specify '__declspec(dllexport)' to make the compiler put the function in the exports. For example: (AddOns.h)
// ADDONS_API marks the functions that AddOns.dll makes available.
// Define ADDONS_EXPORTS when building the DLL itself; leave it undefined in code
// that consumes the DLL. Both branches use extern "C" so that the exporting and
// importing sides agree on the same unmangled symbol name.
#ifdef ADDONS_EXPORTS
#define ADDONS_API extern "C" __declspec(dllexport)
#else
#define ADDONS_API extern "C" __declspec(dllimport)
#endif

// Example export taking a single string pointer.
// NOTE(review): LPCTSTR follows the UNICODE build setting - confirm the DLL is
// built to match the kind of string the patched exe will pass in.
ADDONS_API bool someFunction(LPCTSTR someTextParam);
You can code the core of the functionality in the methods defined in your DLL and then the only assembly you need to code is loading your DLL and calling the functions. The above function for example, will be in a DLL called 'AddOns.dll' and take 1 parameter (a pointer to a string). In assembly code you can load the DLL using the LoadLibrary function and find the address of the function using GetProcAddress. If the exe does not import these functions, read my other blog post on Finding the address of GetProcAddress.
; Code-cave stub: load AddOns.dll, look up someFunction and call it with one argument.
; "&name" stands for an address you fill in when patching the exe.
; NOTE(review): eax/ecx/edx are not preserved across the calls - save them first
; if the original code at the patch point still needs their values.
push &"AddOns.dll"                ; address of string for the DLL, in a code cave
call &LoadLibraryA                ; address of the imported/loaded LoadLibraryA function
test eax, eax                     ; NULL = DLL could not be loaded
jz &returnAddress                 ; skip the add-on rather than call through a null pointer
push &"someFunction"              ; address of string for the function, in a code cave
push eax                          ; HMODULE AddOns.dll (returned from LoadLibrary)
call &GetProcAddress              ; address of the imported/loaded GetProcAddress function (it has no A/W variants)
test eax, eax                     ; NULL = export not found
jz &returnAddress                 ; skip the add-on
push &"SomeString"                ; address in the app to the parameter you want to pass
call eax                          ; call the function in the dll (returned from GetProcAddress)
add esp, 4                        ; cdecl: caller removes the one argument it pushed
jmp &returnAddress                ; go back to the original code
Since C functions use the cdecl calling convention by default, you have to clean the stack of your params when you are finished and if your function returns a value, it will be in EAX. Once you have put the above code in a cave somewhere, you simply need to jmp to it at the appropriate point in the original exe and then have it jmp back to the best place in the code to give control back to the normal program flow when you have finished.
Permalink: Extending Functionality of a compiled PE using DLLs

Finding the address of GetProcAddress

June 29, 2005

When working within the confines of a compiled PE file (.exe) you often want to call external functions stored in DLLs. If the DLL/function was imported when the exe was compiled then it will be referenced in the imports table; DLLs/functions in the imports table will be loaded by the Windows loader when the PE is executed, you can then use the IAT (import address table) to get the address to call these functions. However, most of the time the DLL and/or function you want to use was not originally imported by the PE file and will therefore not have an entry in the IAT. You can load a DLL manually by calling LoadLibrary and GetProcAddress in your assembly code, passing in the name of the DLL and the name of the function you wish to load respectively. This method relies on the exe at least having the two imports: Kernel32.LoadLibrary Kernel32.GetProcAddress Without these two imports, you have to do something a little more elaborate. The method I will outline relies on the exe having only one import: Kernel32.GetModuleHandle Which most Win32 PE files will import. This function returns HMODULE, which is actually the base address of the loaded DLL. (If the file you are working on doesn't import any of these functions, you can try using the standard Kernel32.dll base address of 0xBFF70000 or use some other technique, such as reading the PEB, TEB or using SEH, try Googling.) Assuming you now have the base address of Kernel32.dll you can read the PE headers of the image to find the exported GetProcAddress. Once you have GetProcAddress, you can use it to find LoadLibrary and you then have the two functions you need to load any number of DLLs/functions.
; Locate Kernel32!GetProcAddress by walking Kernel32's export directory.
; Syntax is debugger-style: numeric offsets are hexadecimal, "myExe.&x" is an address
; you supply when patching. Offsets assume a 32-bit (PE32) image.
; On exit: stack holds hMod(Kernel32) with the VA of GetProcAddress pushed on top of it;
;          EBX = VA of GetProcAddress, EAX = hMod. ECX, EDX, ESI, EDI are clobbered.
; NOTE(review): there is no check for a NULL module handle or for running off the end
; of the name table - the search assumes the export exists.
cld                                                 ;cmps below must step esi/edi forwards
mov eax,myExe.&"Kernel32.dll"                       ;Address of ASCII "Kernel32.dll" - already in imports
push eax
mov eax,<jmp.&Kernel32.GetModuleHandleA>            ;IAT Thunk
call eax
push eax                                            ;hMod
add eax,dword ptr ds:[eax+3c]                       ;eax+e_lfanew = VA of PE header
mov ebx,dword ptr ds:[eax+78]                       ;PE+78h = RVA of export directory (IMAGE_DIRECTORY_ENTRY_EXPORT)
mov eax,dword ptr ss:[esp]                          ;hMod
add eax,ebx                                         ;VA to IMAGE_EXPORT_DIRECTORY
pop edx                                             ;edx = hMod
push eax                                            ;save export directory address
push edx                                            ;hMod on top
mov ebx,dword ptr ds:[eax+20]                       ;Read IMAGE_EXPORT_DIRECTORY+20h (RVA AddressOfNames[])
pop eax                                             ;hMod
xor ecx,ecx                                         ;reset counter
:label1
inc ecx                                             ;count++ (1-based index of the name being tested)
push ebx                                            ;save RVA of current AddressOfNames[] entry
push eax                                            ;save hMod
add eax,ebx                                         ;VA of current AddressOfNames[] entry
mov ebx,dword ptr ds:[eax]                          ;RVA FunctionName
mov eax,dword ptr ss:[esp]                          ;hMod
add eax,ebx                                         ;VA of FunctionName
mov esi,eax
pop eax                                             ;hMod
pop ebx                                             ;RVA of current AddressOfNames[] entry
mov edi,myExe.&"GetProcA"                           ;ASCII "GetProcA" - Put this in a cave
cmps dword ptr ds:[esi],dword ptr es:[edi]          ;(DWORD cmp) FunctionName = "GetP"
je short myExe.&label2                              ;match, check next half
add ebx,4                                           ;else, next RVA (4byte RVAs)
jmp short myExe.&label1                             ;loop
:label2
add ebx,4                                           ;next RVA (4byte RVAs), in case the 2nd half fails
cmps dword ptr ds:[esi],dword ptr es:[edi]          ;(2nd DWORD cmp) FunctionName = "rocA"
jnz short myExe.&label1                             ;if not, loop again
pop edx                                             ;VA of IMAGE_EXPORT_DIRECTORY
push eax                                            ;leave hMod (kernel base) on the stack for later
dec ecx                                             ;zero-based index of the matched name
mov ebx,dword ptr ds:[edx+24]                       ;read IMAGE_EXPORT_DIRECTORY+24h (RVA AddressOfNameOrdinals[])
add ebx,eax                                         ;VA AddressOfNameOrdinals[]
movzx ecx,word ptr ds:[ebx+ecx*2]                   ;name index -> function index (16-bit entries)
mov ebx,dword ptr ds:[edx+1c]                       ;read IMAGE_EXPORT_DIRECTORY+1Ch (RVA AddressOfFunctions[])
add ebx,eax                                         ;VA AddressOfFunctions[]
mov ebx,dword ptr ds:[ebx+ecx*4]                    ;RVA of GetProcAddress
add ebx,eax                                         ;VA of GetProcAddress
push ebx                                            ;entry GetProcAddress
The above code snippet leaves you with the VA of GetProcAddress on the stack (and in EBX). You can then go on to find LoadLibrary:
push myExe.&"LoadLibraryA"                      ;2nd argument: address of ASCII "LoadLibraryA" - put this in a cave
push dword ptr ss:[esp+8]                       ;1st argument: hMod Kernel32 - sits below GetProcAddress on the stack, so it is at esp+8 after the push above
call ebx                                        ;call GetProcAddress (EBX from the snippet above) - returns the address in EAX
push eax                                        ;save the LoadLibraryA address to the stack
You have now loaded GetProcAddress and LoadLibrary and have them both on the stack ready for use.
Permalink: Finding the address of GetProcAddress

x86 Assembly Bubble Sort

January 09, 2005

The following example shows how you can use a bubble sort in Assembly language to sort some numbers:
;-----------------------------------------------------------------------
; Bubble sort of a byte array, ascending, in place (MASM, 32-bit flat).
; Bytes are compared as signed values.
; Registers: al/ah = the pair being compared, bl = "swapped this pass" flag,
;            ecx = index of the left element, edx = pairs to compare this pass.
; Exit:      eax = 0 (exit code), example_data sorted.
;-----------------------------------------------------------------------
.386
.model flat,stdcall

option casemap:none

.data
example_data    db 1,3,4,5,2,5,7,4,6,0
num_of_elements dd 10                   ;dword, because it is loaded into a 32-bit register

.code
start:
    mov edx, dword ptr[num_of_elements] ;whatever the programmer entered
    cmp edx, 2
    jb done                             ;0 or 1 elements: nothing to sort
    dec edx                             ;n elements = n-1 neighbouring pairs

next_pass:
    xor ecx, ecx                        ;start again from the first pair
    xor ebx, ebx                        ;no swaps yet in this pass

next_pair:
    mov al, byte ptr[example_data+ecx]   ;get 1 byte
    mov ah, byte ptr[example_data+ecx+1] ;and the byte to its right
    cmp al, ah                          ;compare the 2
    jle in_order                        ;left <= right, leave the pair alone
    mov byte ptr[example_data+ecx], ah   ;left > right: swap them
    mov byte ptr[example_data+ecx+1], al
    mov bl, 1                           ;remember that this pass changed something

in_order:
    inc ecx                             ;move on to the next pair
    cmp ecx, edx
    jb next_pair                        ;more pairs left in this pass

    test bl, bl
    jz done                             ;a full pass without swaps: list is sorted
    dec edx                             ;largest remaining value is now in place
    jnz next_pass                       ;shorter pass next time round

done:
    xor eax, eax                        ;exit code 0
    ret
end start
Permalink: x86 Assembly Bubble Sort