linux-mm.kvack.org archive mirror
* [RFC][PATCH] allow bigger PAGE_OFFSET with PAE
@ 2003-01-07 20:06 Dave Hansen
  2003-01-07 23:37 ` William Lee Irwin III
  0 siblings, 1 reply; 5+ messages in thread
From: Dave Hansen @ 2003-01-07 20:06 UTC (permalink / raw)
  To: Linux Kernel Mailing List; +Cc: linux-mm

[-- Attachment #1: Type: text/plain, Size: 1157 bytes --]

Currently, with PAE enabled, we require the user:kernel split to fall
on a PMD-table boundary (one whole PGD entry), so it can only be moved
in 1GB increments.  There are two reasons for this.  First,
kernel_physical_mapping_init() assumes, when it initializes the
kernel's PMD entries, that they start at offset 0 within the PMD.
This is fixed by starting them at __pmd_offset(PAGE_OFFSET) instead.
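
As a concrete illustration, here is a throwaway userspace sketch of
that offset math.  The PGDIR_SHIFT/PMD_SHIFT/PTRS_PER_PMD values
mirror the i386 PAE definitions; the PAGE_OFFSET of 0xB0000000 is just
a hypothetical split that is not 1GB-aligned:

/* Sketch only: where the kernel mapping starts for a non-1GB-aligned
 * PAGE_OFFSET under PAE (2/9/9/12 split). */
#include <stdio.h>

#define PMD_SHIFT	21		/* one PMD entry maps 2MB */
#define PGDIR_SHIFT	30		/* one PGD entry maps 1GB */
#define PTRS_PER_PMD	512

#define __pgd_offset(addr)	((addr) >> PGDIR_SHIFT)
#define __pmd_offset(addr)	(((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))

int main(void)
{
	unsigned long page_offset = 0xB0000000UL; /* hypothetical 2.75:1.25GB split */

	/* Prints pgd_ofs = 2, pmd_ofs = 384: the kernel's PMD entries
	 * start at index 384, not 0, which is what the old code assumed. */
	printf("pgd_ofs = %lu, pmd_ofs = %lu\n",
	       __pgd_offset(page_offset), __pmd_offset(page_offset));
	return 0;
}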

Secondly, secondary SMP CPUs require that the trampoline code be
identity-mapped (virtual addresses map to the same physical
addresses).  Right now, this is accomplished by setting the first PGD
entry to be the same as the last.  That is fine as long as that PGD
entry eventually maps physical 0x00000000, but my changes above break
that assumption.  So, I allocate another PMD and use it for the
identity mapping.  The current code allocates PTEs when PAE is used
without PSE support, but there is nothing to free them.  Any
suggestions for a clean way to do this?

Also, this gets the kernel's pagetables right, but neglects
userspace's for now.  pgd_alloc() needs to be fixed to allocate
another PMD, if the split isn't PMD-aligned.
-- 
Dave Hansen
haveblue@us.ibm.com


[-- Attachment #2: unaligned-page_offset-pae-2.5.53-3.patch --]
[-- Type: text/plain, Size: 4181 bytes --]

diff -ur linux-2.5.53-clean/arch/i386/mm/init.c linux-2.5.53-weirdsplit/arch/i386/mm/init.c
--- linux-2.5.53-clean/arch/i386/mm/init.c	Mon Dec 23 21:21:03 2002
+++ linux-2.5.53-weirdsplit/arch/i386/mm/init.c	Mon Jan  6 09:41:02 2003
@@ -117,6 +117,24 @@
 	}
 }
 
+
+/*
+ * Abstract out using large pages when mapping KVA, or the SMP identity
+ * mapping
+ */
+void pmd_map_pfn_range(pmd_t* pmd_entry, unsigned long pfn, unsigned long max_pfn)
+{
+	int pte_ofs;
+	/* Map with big pages if possible, otherwise create normal page tables. */
+	if (cpu_has_pse) {
+		set_pmd(pmd_entry, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
+		pfn += PTRS_PER_PTE;
+	} else {
+		pte_t* pte = one_page_table_init(pmd_entry);
+		for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_pfn; pte++, pfn++, pte_ofs++)
+			set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
+	}
+}
 /*
  * This maps the physical memory to kernel virtual address space, a total 
  * of max_low_pfn pages, by creating page tables starting from address 
@@ -127,8 +145,7 @@
 	unsigned long pfn;
 	pgd_t *pgd;
 	pmd_t *pmd;
-	pte_t *pte;
-	int pgd_ofs, pmd_ofs, pte_ofs;
+	int pgd_ofs, pmd_ofs;
 
 	pgd_ofs = __pgd_offset(PAGE_OFFSET);
 	pgd = pgd_base + pgd_ofs;
@@ -138,19 +155,47 @@
 		pmd = one_md_table_init(pgd);
 		if (pfn >= max_low_pfn)
 			continue;
-		for (pmd_ofs = 0; pmd_ofs < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_ofs++) {
-			/* Map with big pages if possible, otherwise create normal page tables. */
-			if (cpu_has_pse) {
-				set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
-				pfn += PTRS_PER_PTE;
-			} else {
-				pte = one_page_table_init(pmd);
-
-				for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++)
-					set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
-			}
+	
+		/* beware of starting KVA in the middle of a pmd. */
+		if( pgd_ofs == __pgd_offset(PAGE_OFFSET) ) {
+			pmd_ofs = __pmd_offset(PAGE_OFFSET);
+			pmd = &pmd[pmd_ofs];
+		} else
+			pmd_ofs = 0;
+		
+		for (; pmd_ofs < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_ofs++) {
+			pmd_map_pfn_range(pmd, pfn, max_low_pfn);
+			pfn += PTRS_PER_PTE; 
 		}
-	}	
+	}
+}
+
+
+/*
+ * Add low memory identity-mappings - SMP needs it when
+ * starting up on an AP from real-mode. In the non-PAE
+ * case we already have these mappings through head.S.
+ * All user-space mappings are explicitly cleared after
+ * SMP startup in zap_low_mappings().
+ */
+static void __init low_physical_mapping_init(pgd_t *pgd_base)
+{
+#if CONFIG_X86_PAE
+	unsigned long pfn = 0;
+	int pmd_ofs = 0;
+	pmd_t *pmd = one_md_table_init(pgd_base);
+
+	if(!cpu_has_pse) {
+		printk("PAE enabled, but no support for PSE (large pages)!");
+		printk("this is likely to waste some RAM.");
+	}
+	
+	for (; pmd_ofs < PTRS_PER_PMD && pfn <= max_low_pfn; pmd++, pmd_ofs++) { 
+		pmd_map_pfn_range(pmd, pfn, max_low_pfn);
+		pfn += PTRS_PER_PTE;
+	}
+#endif
+			
 }
 
 static inline int page_kills_ppro(unsigned long pagenr)
@@ -213,7 +258,7 @@
 	pgd = swapper_pg_dir + __pgd_offset(vaddr);
 	pmd = pmd_offset(pgd, vaddr);
 	pte = pte_offset_kernel(pmd, vaddr);
-	pkmap_page_table = pte;	
+	pkmap_page_table = pte;
 }
 
 void __init one_highpage_init(struct page *page, int pfn, int bad_ppro)
@@ -278,6 +323,7 @@
 	}
 
 	kernel_physical_mapping_init(pgd_base);
+	low_physical_mapping_init(pgd_base);
 	remap_numa_kva();
 
 	/*
@@ -286,19 +332,7 @@
 	 */
 	vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
 	page_table_range_init(vaddr, 0, pgd_base);
-
 	permanent_kmaps_init(pgd_base);
-
-#if CONFIG_X86_PAE
-	/*
-	 * Add low memory identity-mappings - SMP needs it when
-	 * starting up on an AP from real-mode. In the non-PAE
-	 * case we already have these mappings through head.S.
-	 * All user-space mappings are explicitly cleared after
-	 * SMP startup.
-	 */
-	pgd_base[0] = pgd_base[USER_PTRS_PER_PGD];
-#endif
 }
 
 void zap_low_mappings (void)
@@ -310,6 +344,7 @@
 	 * Note that "pgd_clear()" doesn't do it for
 	 * us, because pgd_clear() is a no-op on i386.
 	 */
+	free_page(pgd_page(swapper_pg_dir[0]));
 	for (i = 0; i < USER_PTRS_PER_PGD; i++)
 #if CONFIG_X86_PAE
 		set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));


* Re: [RFC][PATCH] allow bigger PAGE_OFFSET with PAE
  2003-01-07 20:06 [RFC][PATCH] allow bigger PAGE_OFFSET with PAE Dave Hansen
@ 2003-01-07 23:37 ` William Lee Irwin III
  2003-01-08 21:04   ` Dave Hansen
  0 siblings, 1 reply; 5+ messages in thread
From: William Lee Irwin III @ 2003-01-07 23:37 UTC (permalink / raw)
  To: Dave Hansen; +Cc: Linux Kernel Mailing List, linux-mm

On Tue, Jan 07, 2003 at 12:06:38PM -0800, Dave Hansen wrote:
> Also, this gets the kernel's pagetables right, but neglects 
> userspace's for now.  pgd_alloc() needs to be fixed to allocate 
> another PMD, if the split isn't PMD-aligned.

Um, that should be automatic when USER_PTRS_PER_PGD is increased.

I see the following:

$ grep -n TASK_SIZE include/asm-i386/*.h                 
include/asm-i386/a.out.h:22:#define STACK_TOP   TASK_SIZE
include/asm-i386/elf.h:60:#define ELF_ET_DYN_BASE         (TASK_SIZE / 3 * 2)
include/asm-i386/pgtable.h:68:#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE)
include/asm-i386/processor.h:277:#define TASK_SIZE      (PAGE_OFFSET)
include/asm-i386/processor.h:282:#define TASK_UNMAPPED_BASE     (PAGE_ALIGN(TASK_SIZE / 3))


... which sounds like you need to round up in an overflow-safe fashion
in the macro.
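
Something like this, as a rough sketch (TASK_SIZE and PGDIR_SIZE are
the real macros from the grep above; the _NAIVE/_SAFE names are only
illustrative):

/* Naive round-up wraps a 32-bit unsigned long once TASK_SIZE goes
 * above 0xC0000000 (PGDIR_SIZE is 1GB with PAE): */
#define USER_PTRS_PER_PGD_NAIVE	((TASK_SIZE + PGDIR_SIZE - 1) / PGDIR_SIZE)

/* Overflow-safe: divide first, then add one entry for any partial PGD: */
#define USER_PTRS_PER_PGD_SAFE \
	(TASK_SIZE / PGDIR_SIZE + ((TASK_SIZE % PGDIR_SIZE) ? 1 : 0))

With TASK_SIZE = 0xB0000000, for instance, the safe form gives 3 PGD
entries (two full plus one partial).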


Bill


* Re: [RFC][PATCH] allow bigger PAGE_OFFSET with PAE
  2003-01-07 23:37 ` William Lee Irwin III
@ 2003-01-08 21:04   ` Dave Hansen
  2003-01-08 22:05     ` William Lee Irwin III
  0 siblings, 1 reply; 5+ messages in thread
From: Dave Hansen @ 2003-01-08 21:04 UTC (permalink / raw)
  To: William Lee Irwin III; +Cc: Linux Kernel Mailing List, linux-mm

William Lee Irwin III wrote:
> On Tue, Jan 07, 2003 at 12:06:38PM -0800, Dave Hansen wrote:
> 
>>Also, this gets the kernel's pagetables right, but neglects 
>>userspace's for now.  pgd_alloc() needs to be fixed to allocate 
>>another PMD, if the split isn't PMD-aligned.
> 
> Um, that should be automatic when USER_PTRS_PER_PGD is increased.

Nope, you need a little bit more.  pgd_alloc() relies on its memcpy() 
to provide the kernel mappings.  After the last user PMD is allocated, 
you still need to copy the kernel-shared part of it in.

-- 
Dave Hansen
haveblue@us.ibm.com



* Re: [RFC][PATCH] allow bigger PAGE_OFFSET with PAE
  2003-01-08 21:04   ` Dave Hansen
@ 2003-01-08 22:05     ` William Lee Irwin III
  2003-01-08 22:44       ` William Lee Irwin III
  0 siblings, 1 reply; 5+ messages in thread
From: William Lee Irwin III @ 2003-01-08 22:05 UTC (permalink / raw)
  To: Dave Hansen; +Cc: Linux Kernel Mailing List, linux-mm

On Tue, Jan 07, 2003 at 12:06:38PM -0800, Dave Hansen wrote:
>>> Also, this gets the kernel's pagetables right, but neglects 
>>> userspace's for now.  pgd_alloc() needs to be fixed to allocate 
>>> another PMD, if the split isn't PMD-aligned.

William Lee Irwin III wrote:
>> Um, that should be automatic when USER_PTRS_PER_PGD is increased.

On Wed, Jan 08, 2003 at 01:04:23PM -0800, Dave Hansen wrote:
> Nope, you need a little bit more.  pgd_alloc() relies on its memcpy() 
> to provide the kernel mappings.  After the last user PMD is allocated, 
> you still need to copy the kernel-shared part of it in.

See the bit about rounding up. Then again, the pmd entries don't get
filled in by any of that...


Bill


* Re: [RFC][PATCH] allow bigger PAGE_OFFSET with PAE
  2003-01-08 22:05     ` William Lee Irwin III
@ 2003-01-08 22:44       ` William Lee Irwin III
  0 siblings, 0 replies; 5+ messages in thread
From: William Lee Irwin III @ 2003-01-08 22:44 UTC (permalink / raw)
  To: Dave Hansen, Linux Kernel Mailing List, linux-mm

On Tue, Jan 07, 2003 at 12:06:38PM -0800, Dave Hansen wrote:
>>>> Also, this gets the kernel's pagetables right, but neglects 
>>>> userspace's for now.  pgd_alloc() needs to be fixed to allocate 
>>>> another PMD, if the split isn't PMD-aligned.

William Lee Irwin III wrote:
>>> Um, that should be automatic when USER_PTRS_PER_PGD is increased.

On Wed, Jan 08, 2003 at 01:04:23PM -0800, Dave Hansen wrote:
>> Nope, you need a little bit more.  pgd_alloc() relies on its memcpy() 
>> to provide the kernel mappings.  After the last user PMD is allocated, 
>> you still need to copy the kernel-shared part of it in.

On Wed, Jan 08, 2003 at 02:05:33PM -0800, William Lee Irwin III wrote:
> See the bit about rounding up. Then again, the pmd entries don't get
> filled in by any of that...

Okay, basically:

#define __USER_PTRS_PER_PGD	(TASK_SIZE/PGDIR_SIZE)
#define PARTIAL_PGD	(TASK_SIZE > __USER_PTRS_PER_PGD*PGDIR_SIZE ? 1 : 0)
#define PARTIAL_PMD	((TASK_SIZE % PGDIR_SIZE)/PMD_SIZE)
#define USER_PTRS_PER_PGD	(PARTIAL_PGD + __USER_PTRS_PER_PGD)

then


pgd_t *pgd_alloc(struct mm_struct *mm)
{
	int i;
	pgd_t *pgd = kmem_cache_alloc(pae_pgd_cachep, GFP_KERNEL);

	if (pgd) {
		for (i = 0; i < USER_PTRS_PER_PGD; i++) {
			unsigned long pmd = __get_free_page(GFP_KERNEL);
			if (!pmd)
				goto out_oom;
			clear_page(pmd);
			set_pgd(pgd + i, __pgd(1 + __pa(pmd)));
		}

		if (USER_PTRS_PER_PGD < PTRS_PER_PGD)
			memcpy(pgd + USER_PTRS_PER_PGD,
				swapper_pg_dir + USER_PTRS_PER_PGD,
				(PTRS_PER_PGD-USER_PTRS_PER_PGD)*sizeof(pgd_t));

		if (PARTIAL_PGD) {
			pgd_t *kpgd, *upgd;
			pmd_t *kpmd, *upmd;

			kpgd = pgd_offset_k(TASK_SIZE);
			upgd = pgd_offset(mm, TASK_SIZE);
			kpmd = pmd_offset(kpgd, TASK_SIZE);
			upmd = pmd_offset(upgd, TASK_SIZE);

			memcpy(upmd, kpmd, (PTRS_PER_PMD-PARTIAL_PMD)*sizeof(pmd_t));

		}
	}
	return pgd;
out_oom:
	for (i--; i >= 0; i--)
		free_page((unsigned long)__va(pgd_val(pgd[i])-1));
	kmem_cache_free(pae_pgd_cachep, pgd);
	return NULL;
}

etc.

