--- trunk/mkinitrd-magellan/isolinux/bcopy32.inc 2007/09/01 22:45:15 532 +++ trunk/mkinitrd-magellan/isolinux/bcopy32.inc 2010/08/19 09:50:43 1133 @@ -1,7 +1,7 @@ -;; $Id: bcopy32.inc,v 1.1 2007-09-01 22:44:04 niro Exp $ ;; ----------------------------------------------------------------------- -;; -;; Copyright 1994-2005 H. Peter Anvin - All Rights Reserved +;; +;; Copyright 1994-2009 H. Peter Anvin - All Rights Reserved +;; Copyright 2009 Intel Corporation; author: H. Peter Anvin ;; ;; This program is free software; you can redistribute it and/or modify ;; it under the terms of the GNU General Public License as published by @@ -13,7 +13,7 @@ ;; ;; bcopy32.inc -;; +;; ;; 32-bit bcopy routine for real mode ;; @@ -27,110 +27,134 @@ ; segments, but this stuff is painful enough as it is without having to rely ; on everything happening "as it ought to." ; -; NOTE: this code is relocated into low memory, just after the .earlybss -; segment, in order to support to "bcopy over self" operation. 
-; - section .bcopy32 - align 8 -__bcopy_start: - - ; This is in the .text segment since it needs to be - ; contiguous with the rest of the bcopy stuff - -bcopy_gdt: dw bcopy_gdt_size-1 ; Null descriptor - contains GDT - dd bcopy_gdt ; pointer for LGDT instruction - dw 0 - dd 0000ffffh ; Code segment, use16, readable, - dd 00009b00h ; present, dpl 0, cover 64K - dd 0000ffffh ; Data segment, use16, read/write, - dd 008f9300h ; present, dpl 0, cover all 4G - dd 0000ffffh ; Data segment, use16, read/write, - dd 00009300h ; present, dpl 0, cover 64K - ; The rest are used for COM32 only - dd 0000ffffh ; Code segment, use32, readable, - dd 00cf9b00h ; present, dpl 0, cover all 4G - dd 0000ffffh ; Data segment, use32, read/write, - dd 00cf9300h ; present, dpl 0, cover all 4G -bcopy_gdt_size: equ $-bcopy_gdt + bits 16 + section .text ; ; bcopy: ; 32-bit copy, overlap safe ; ; Inputs: -; ESI - source pointer +; ESI - source pointer (-1 means do bzero rather than bcopy) ; EDI - target pointer ; ECX - byte count ; DF - zero ; ; Outputs: -; ESI - first byte after source +; ESI - first byte after source (garbage if ESI == -1 on entry) ; EDI - first byte after target -; ECX - zero ; -bcopy: push eax - push esi - push edi - push ecx - pushf ; Saves, among others, the IF flag +bcopy: jecxz .ret + pushad + push word pm_bcopy + call simple_pm_call + popad + add edi,ecx + add esi,ecx +.ret: ret + +; +; shuffle_and_boot_raw: +; The new version of shuffle and boot. +; Inputs: +; ESI -> Pointer to list of (dst, src, len) pairs(*) +; EDI -> Pointer to safe area for list + shuffler +; (must not overlap this code nor the RM stack) +; ECX -> Byte count of list area (for initial copy) +; +; If src == -1: then the memory pointed to by (dst, len) is bzeroed; +; this is handled inside the bcopy routine. 
+; +; If len == 0: this marks the end of the list; dst indicates +; the entry point and src the mode (0 = pm, 1 = rm) +; +shuffle_and_boot_raw: + push word pm_shuffle + call simple_pm_call + ; Never returns... + jmp kaboom + +; +; This routine is used to invoke a simple routine in 32-bit protected +; mode (with 32-bit zero-based CS, DS, ES, and SS, with ESP pointing to the +; real-mode stack even if the real-mode stack was in a nonzero SS.) +; +; No interrupt thunking services are provided; interrupts are disabled +; for the duration of the routine. Don't run for too long at a time +; unless you really mean it. +; +; Inputs: +; On stack - pm entrypoint (IP only) +; EAX, EBP preserved until real-mode exit +; EBX, ECX, EDX, ESI and EDI passed to the called routine +; +; Outputs: +; EAX, EBP restored from real-mode entry +; All other registers as returned from called function +; PM entrypoint cleaned off stack +; +simple_pm_call: + push eax + push ebp + movzx ebp,sp ; BP is used as frame pointer + pushfd ; Saves, among others, the IF flag push ds push es + push fs + push gs cli call enable_a20 + mov byte [cs:bcopy_gdt.TSS+5],89h ; Mark TSS unbusy + + ; Convert the stack segment to a base + xor eax,eax + mov ax,ss + shl eax,4 + add ebp,eax ; EBP is now an absolute frame ptr + + ; Save the old segmented stack pointer + mov [cs:.rm_esp],esp + mov [cs:.rm_ss],ss + o32 lgdt [cs:bcopy_gdt] mov eax,cr0 or al,1 mov cr0,eax ; Enter protected mode - jmp 08h:.in_pm + jmp PM_CS32:.in_pm -.in_pm: mov ax,10h ; Data segment selector - mov es,ax - mov ds,ax - - ; Don't mess with ss, fs, and gs. They are never changed - ; and should be able to make it back out of protected mode. - ; This works because (and only because) we don't take - ; interrupt in protected mode. 
- - cmp esi,edi ; If source > destination, we might - ja .reverse ; have to copy backwards - -.forward: - mov al,cl ; Save low bits - and al,3 - shr ecx,2 ; Convert to dwords - a32 rep movsd ; Do our business - ; At this point ecx == 0 - - mov cl,al ; Copy any fractional dword - a32 rep movsb - jmp .exit - -.reverse: - std ; Reverse copy - lea esi,[esi+ecx-1] ; Point to final byte - lea edi,[edi+ecx-1] - mov eax,ecx - and ecx,3 - shr eax,2 - a32 rep movsb - - ; Change ESI/EDI to point to the last dword, instead - ; of the last byte. - sub esi,3 - sub edi,3 - mov ecx,eax - a32 rep movsd + bits 32 +.in_pm: + mov ax,PM_DS32 + mov ss,eax + lea esp,[ebp-8*4-2*4] ; Flat mode stack + mov es,eax + mov ds,eax + + ; Set fs, gs, tr, and ldtr in case we're on a virtual + ; machine running on Intel VT hardware -- it can't + ; deal with a partial transition, for no good reason. + + mov al,PM_DS16 ; Real-mode-like segment + mov fs,eax + mov gs,eax + mov al,PM_TSS ; Intel VT really doesn't want + ltr ax ; an invalid TR and LDTR, so give + xor eax,eax ; it something that it can use... + lldt ax ; (sigh) - cld + movzx eax,word [ebp+2*4+2] + call eax ; Call actual routine + jmp PM_CS16:.exit + bits 16 .exit: - mov ax,18h ; "Real-mode-like" data segment - mov es,ax - mov ds,ax + mov ax,PM_DS16 ; "Real-mode-like" data segment + mov es,eax + mov ds,eax + mov ss,eax mov eax,cr0 and al,~1 @@ -138,19 +162,24 @@ jmp 0:.in_rm .in_rm: ; Back in real mode + lss esp,[cs:.rm_esp] ; Restore the stack + pop gs + pop fs pop es pop ds - call disable_a20 - popf ; Re-enables interrupts - pop eax - pop edi - pop esi - add edi,eax - add esi,eax + popfd ; Re-enables interrupts + pop ebp pop eax - ret + ret 2 ; Drops the pm entry + + section .bss + alignb 4 +.rm_esp resd 1 +.rm_ss resw 1 + + section .text ; ; Routines to enable and disable (yuck) A20. 
These routines are gathered ; from tips from a couple of sources, including the Linux kernel and @@ -158,57 +187,39 @@ ; is indicated by Donnie Barnes of RedHat, the problematic system being an ; IBM ThinkPad 760EL. ; -; We typically toggle A20 twice for every 64K transferred. -; -%define io_delay call _io_delay -%define IO_DELAY_PORT 80h ; Invalid port (we hope!) -%define disable_wait 32 ; How long to wait for a disable - -; Note the skip of 2 here -%define A20_DUNNO 0 ; A20 type unknown -%define A20_NONE 2 ; A20 always on? -%define A20_BIOS 4 ; A20 BIOS enable -%define A20_KBC 6 ; A20 through KBC -%define A20_FAST 8 ; A20 through port 92h -slow_out: out dx, al ; Fall through - -_io_delay: out IO_DELAY_PORT,al - out IO_DELAY_PORT,al - ret + section .data + alignz 2 +A20Ptr dw a20_dunno + + section .bss + alignb 4 +A20Test resd 1 ; Counter for testing A20 status +A20Tries resb 1 ; Times until giving up on A20 + section .text enable_a20: pushad mov byte [cs:A20Tries],255 ; Times to try to make this work try_enable_a20: -; -; Flush the caches -; -%if DO_WBINVD - call try_wbinvd -%endif ; -; If the A20 type is known, jump straight to type +; First, see if we are on a system with no A20 gate, or the A20 gate +; is already enabled for us... ; - mov bp,[cs:A20Type] - jmp word [cs:bp+A20List] - -; -; First, see if we are on a system with no A20 gate -; -a20_dunno: a20_none: - mov byte [cs:A20Type], A20_NONE call a20_test jnz a20_done + ; Otherwise, see if we had something memorized... 
+ jmp word [cs:A20Ptr] ; ; Next, try the BIOS (INT 15h AX=2401h) ; +a20_dunno: a20_bios: - mov byte [cs:A20Type], A20_BIOS + mov word [cs:A20Ptr], a20_bios mov ax,2401h pushf ; Some BIOSes muck with IF int 15h @@ -225,9 +236,9 @@ call empty_8042 jnz a20_done ; A20 live, no need to use KBC - mov byte [cs:A20Type], A20_KBC ; Starting KBC command sequence + mov word [cs:A20Ptr], a20_kbc ; Starting KBC command sequence - mov al,0D1h ; Command write + mov al,0D1h ; Write output port out 064h, al call empty_8042_uncond @@ -235,6 +246,13 @@ out 060h, al call empty_8042_uncond + ; Apparently the UHCI spec assumes that A20 toggle + ; ends with a null command (assumed to be for synchronization?) + ; Put it here to see if it helps anything... + mov al,0FFh ; Null command + out 064h, al + call empty_8042_uncond + ; Verify that A20 actually is enabled. Do that by ; observing a word in low memory and the same word in ; the HMA until they are no longer coherent. Note that @@ -253,7 +271,7 @@ ; Running out of options here. Final attempt: enable the "fast A20 gate" ; a20_fast: - mov byte [cs:A20Type], A20_FAST ; Haven't used the KBC yet + mov word [cs:A20Ptr], a20_fast in al, 092h or al,02h and al,~01h ; Don't accidentally reset the machine! @@ -272,13 +290,16 @@ ; Oh bugger. A20 is not responding. Try frobbing it again; eventually give up ; and report failure to the user. ; - - dec byte [cs:A20Tries] - jnz try_enable_a20 + jnz a20_dunno ; Did we get the wrong type? mov si, err_a20 jmp abort_load + + section .data +err_a20 db CR, LF, 'A20 gate not responding!', CR, LF, 0 + section .text + ; ; A20 unmasked, proceed... ; @@ -290,76 +311,28 @@ ; This routine tests if A20 is enabled (ZF = 0). This routine ; must not destroy any register contents. ; +; The no-write early out avoids the io_delay in the (presumably common) +; case of A20 already enabled (e.g. from a previous call.)
+; a20_test: push es push cx - push ax - mov cx,0FFFFh ; HMA = segment 0FFFFh + push eax + mov cx,0FFFFh ; HMA = segment 0FFFFh mov es,cx - mov cx,32 ; Loop count - mov ax,[cs:A20Test] -.a20_wait: inc ax - mov [cs:A20Test],ax - io_delay ; Serialize, and fix delay - cmp ax,[es:A20Test+10h] - loopz .a20_wait -.a20_done: pop ax + mov eax,[cs:A20Test] + mov cx,32 ; Loop count + jmp .test ; First iteration = early out +.wait: add eax,0x430aea41 ; A large prime number + mov [cs:A20Test],eax + io_delay ; Serialize, and fix delay +.test: cmp eax,[es:A20Test+10h] + loopz .wait +.done: pop eax pop cx pop es ret -disable_a20: - pushad -; -; Flush the caches -; -%if DO_WBINVD - call try_wbinvd -%endif - - mov bp,[cs:A20Type] - jmp word [cs:bp+A20DList] - -a20d_bios: - mov ax,2400h - pushf ; Some BIOSes muck with IF - int 15h - popf - jmp short a20d_snooze - -; -; Disable the "fast A20 gate" -; -a20d_fast: - in al, 092h - and al,~03h - out 092h, al - jmp short a20d_snooze - -; -; Disable the keyboard controller A20 gate -; -a20d_kbc: - call empty_8042_uncond - mov al,0D1h - out 064h, al ; Command write - call empty_8042_uncond - mov al,0DDh ; A20 off - out 060h, al - call empty_8042_uncond - ; Wait a bit for it to take effect -a20d_snooze: - push cx - mov cx, disable_wait -.delayloop: call a20_test - jz .disabled - loop .delayloop -.disabled: pop cx -a20d_dunno: -a20d_none: - popad - ret - ; ; Routine to empty the 8042 KBC controller. If dl != 0 ; then we will test A20 in the loop and exit if A20 is @@ -383,76 +356,9 @@ test al,2 jnz empty_8042 io_delay -.done: ret - -; -; Execute a WBINVD instruction if possible on this CPU -; -%if DO_WBINVD -try_wbinvd: - wbinvd - ret -%endif +.done: ret ; -; bcopy_over_self: -; -; This routine is used to shuffle memory around, followed by -; invoking an entry point somewhere in low memory. 
This routine -; can clobber any memory above 7C00h, we therefore have to move -; necessary code into the trackbuf area before doing the copy, -; and do adjustments to anything except BSS area references. -; -; NOTE: Since PXELINUX relocates itself, put all these -; references in the ".earlybss" segment. -; -; After performing the copy, this routine resets the stack and -; jumps to the specified entrypoint. -; -; IMPORTANT: This routine does not canonicalize the stack or the -; SS register. That is the responsibility of the caller. +; The 32-bit copy and shuffle code is "special", so it is in its own file ; -; Inputs: -; DS:BX -> Pointer to list of (dst, src, len) pairs -; AX -> Number of list entries -; [CS:EntryPoint] -> CS:IP to jump to -; On stack - initial state (fd, ad, ds, es, fs, gs) -; -shuffle_and_boot: - and ax,ax - jz .done -.loop: - mov edi,[bx] - mov esi,[bx+4] - mov ecx,[bx+8] - call bcopy - add bx,12 - dec ax - jnz .loop - -.done: - pop gs - pop fs - pop es - pop ds - popad - popfd - jmp far [cs:EntryPoint] - - align 2 -A20List dw a20_dunno, a20_none, a20_bios, a20_kbc, a20_fast -A20DList dw a20d_dunno, a20d_none, a20d_bios, a20d_kbc, a20d_fast -a20_adjust_cnt equ ($-A20List)/2 - -A20Type dw A20_NONE ; A20 type - - ; Total size of .bcopy32 section - alignb 4, db 0 ; Even number of dwords -__bcopy_size equ $-__bcopy_start - - section .earlybss - alignb 2 -EntryPoint resd 1 ; CS:IP for shuffle_and_boot -SavedSSSP resd 1 ; Saved real mode SS:SP -A20Test resw 1 ; Counter for testing status of A20 -A20Tries resb 1 ; Times until giving up on A20 +%include "bcopyxx.inc"