1) panic log

[ 21.391300] ------------[ cut here ]------------
[ 21.393048] kernel BUG at ../fs/userfaultfd.c:385!
[ 21.394914] invalid opcode: 0000 [#1] SMP PTI
[ 21.396552] CPU: 6 PID: 5572 Comm: syz-executor.5 Not tainted 5.3.0-rc4 #40
[ 21.396554] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
[ 21.396564] RIP: 0010:handle_userfault+0x615/0x6b0
[ 21.396566] Code: c3 e9 ed fc ff ff 48 39 84 24 a0 00 00 00 0f 85 1a fe ff ff e9 69 fe ff ff e8 f7 28 d8 ff 0f 0b 0f 0b 0f 0b 90 e9 71 fa ff ff <0f> 0b bd 00 01 00 00 e9 29 fa ff ff a8 08 75 49 48 c7 c7 e0 1a e5
[ 21.396568] RSP: 0018:ffffc90002d8b9a0 EFLAGS: 00010287
[ 21.396570] RAX: ffff88841bd9d640 RBX: ffffc90002d8baa8 RCX: 00000000ebeaed2d
[ 21.396571] RDX: 0000000000000100 RSI: 0000000000000200 RDI: ffffc90002d8baa8
[ 21.396573] RBP: ffff8884212befe8 R08: ffff88841ba64ab8 R09: 00000000fffffff0
[ 21.396574] R10: 0000000000000000 R11: 0000000000000000 R12: ffff88841ef4bc00
[ 21.396575] R13: ffff88841b782000 R14: ffff888107d0f000 R15: ffff88841bd9d640
[ 21.396577] FS:  00007f3addf2b700(0000) GS:ffff88842fb00000(0000) knlGS:0000000000000000
[ 21.396578] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 21.396579] CR2: 0000000020ffd000 CR3: 000000041b6c2005 CR4: 00000000000206e0
[ 21.396584] Call Trace:
[ 21.396594]  ? __lock_acquire+0x44a/0x10d0
[ 21.396599]  ? trace_hardirqs_on_thunk+0x1a/0x20
[ 21.396606]  ? find_held_lock+0x31/0xa0
[ 21.396612]  ? __handle_mm_fault+0xfc2/0x1140
[ 21.396617]  __handle_mm_fault+0xfcf/0x1140
[ 21.396625]  handle_mm_fault+0x18d/0x390
[ 21.438653]  ? handle_mm_fault+0x46/0x390
[ 21.438660]  __do_page_fault+0x250/0x4e0
[ 21.440872]  do_page_fault+0x31/0x210
[ 21.440878]  async_page_fault+0x43/0x50
[ 21.440881] RIP: 0010:copy_user_handle_tail+0x2/0x10
[ 21.440884] Code: c3 0f 1f 80 00 00 00 00 66 66 90 83 fa 40 0f 82 70 ff ff ff 89 d1 f3 a4 31 c0 66 66 90 c3 66 2e 0f 1f 84 00 00 00 00 00 89 d1 a4 89 c8 66 66 90 c3 66 0f 1f 44 00 00 66 66 90 83 fa 08 0f 82
[ 21.448268] RSP: 0018:ffffc90002d8bcc0 EFLAGS: 00010246
[ 21.448271] RAX: 0000000020ffe000 RBX: 0000000020ffd000 RCX: 0000000000001000
[ 21.448272] RDX: 0000000000001000 RSI: 0000000020ffd000 RDI: ffff88842140f000
[ 21.448273] RBP: 0000000000001000 R08: 0000000000000001 R09: 0000000000000000
[ 21.448274] R10: 0000000000000000 R11: 0000000000000000 R12: ffff88842140f000
[ 21.448275] R13: ffff88841b6aa838 R14: ffff88841ba63e00 R15: fffffffffffffffe
[ 21.448287]  _copy_from_user+0x69/0xa0
[ 21.458060]  mcopy_atomic+0x80f/0xc30
[ 21.458066]  ? find_held_lock+0x31/0xa0
[ 21.459947]  userfaultfd_ioctl+0x2f6/0x1290
[ 21.459952]  ? __lock_acquire+0x44a/0x10d0
[ 21.459954]  ? __lock_acquire+0x44a/0x10d0
[ 21.459959]  do_vfs_ioctl+0xa6/0x6f0
[ 21.459964]  ksys_ioctl+0x60/0x90
[ 21.464350]  __x64_sys_ioctl+0x16/0x20
[ 21.464355]  do_syscall_64+0x5a/0x270
[ 21.464361]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 21.467413] RIP: 0033:0x458c59
[ 21.467416] Code: ad b8 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 7b b8 fb ff c3 66 2e 0f 1f 84 00 00 00 00
[ 21.467417] RSP: 002b:00007f3addf2ac78 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[ 21.467419] RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 0000000000458c59
[ 21.467420] RDX: 0000000020000000 RSI: 00000000c028aa03 RDI: 0000000000000003
[ 21.467421] RBP: 000000000073c040 R08: 0000000000000000 R09: 0000000000000000
[ 21.467422] R10: 0000000000000000 R11: 0000000000000246 R12: 00007f3addf2b6d4
[ 21.467423] R13: 00000000004c34cf R14: 00000000004d6958 R15: 00000000ffffffff
[ 21.467429] Modules linked in:
[ 21.482179] Dumping ftrace buffer:
[ 21.482182]    (ftrace buffer empty)
[ 21.482223] ---[ end trace 5be423d27d99858f ]---
[ 21.485013] RIP: 0010:handle_userfault+0x615/0x6b0
[ 21.485016] Code: c3 e9 ed fc ff ff 48 39 84 24 a0 00 00 00 0f 85 1a fe ff ff e9 69 fe ff ff e8 f7 28 d8 ff 0f 0b 0f 0b 0f 0b 90 e9 71 fa ff ff <0f> 0b bd 00 01 00 00 e9 29 fa ff ff a8 08 75 49 48 c7 c7 e0 1a e5
[ 21.485018] RSP: 0018:ffffc90002d8b9a0 EFLAGS: 00010287
[ 21.491200] RAX: ffff88841bd9d640 RBX: ffffc90002d8baa8 RCX: 00000000ebeaed2d
[ 21.491201] RDX: 0000000000000100 RSI: 0000000000000200 RDI: ffffc90002d8baa8
[ 21.491202] RBP: ffff8884212befe8 R08: ffff88841ba64ab8 R09: 00000000fffffff0
[ 21.491203] R10: 0000000000000000 R11: 0000000000000000 R12: ffff88841ef4bc00
[ 21.491204] R13: ffff88841b782000 R14: ffff888107d0f000 R15: ffff88841bd9d640
[ 21.491206] FS:  00007f3addf2b700(0000) GS:ffff88842fb00000(0000) knlGS:0000000000000000
[ 21.491207] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 21.491208] CR2: 0000000020ffd000 CR3: 000000041b6c2005 CR4: 00000000000206e0
[ 21.491213] Kernel panic - not syncing: Fatal exception
[ 21.491691] Dumping ftrace buffer:
[ 21.491693]    (ftrace buffer empty)
[ 21.491694] Kernel Offset: disabled
[ 21.506981] Rebooting in 86400 seconds..
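For orientation: the call trace is the UFFDIO_COPY path. userfaultfd_ioctl() calls mcopy_atomic(), whose _copy_from_user() faults on the ioctl's source buffer (CR2 == 0x20ffd000 matches RSI/RBX in the inner copy frame), and that nested page fault walks back into handle_userfault(), where the BUG fires. Below is a minimal sketch of that syscall shape, for illustration only: it is not the syz-executor program, the mapping layout is an assumption, and by itself (one mm, one ctx, no handler thread) it nests the fault but blocks rather than tripping the BUG_ON(ctx->mm != mm) at line 385.

/*
 * Sketch of the syscall sequence visible in the trace: an UFFDIO_COPY
 * whose *source* buffer itself faults inside copy_from_user(),
 * re-entering handle_userfault() from within the ioctl. Illustrative
 * only -- not the reproducer; error handling omitted for brevity.
 */
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

	struct uffdio_api api = { .api = UFFD_API };
	ioctl(uffd, UFFDIO_API, &api);

	/* Two-page anonymous area, registered for missing-page faults. */
	char *area = mmap(NULL, 2 * page, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)area, .len = 2 * page },
		.mode  = UFFDIO_REGISTER_MODE_MISSING,
	};
	ioctl(uffd, UFFDIO_REGISTER, &reg);

	/*
	 * src points into the still-missing second page, so mcopy_atomic()'s
	 * copy_from_user() faults on a userfaultfd-armed address, nesting a
	 * page fault inside the ioctl exactly as in the trace. Note that
	 * 0xc028aa03 in RSI of the outer frame is UFFDIO_COPY's ioctl number,
	 * which is what ties the trace to this path.
	 */
	struct uffdio_copy copy = {
		.dst = (unsigned long)area,
		.src = (unsigned long)area + page,
		.len = page,
	};
	ioctl(uffd, UFFDIO_COPY, &copy);
	return 0;
}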
2) assembler dump

(gdb) l * handle_userfault+0x615
0xffffffff8131dc05 is in handle_userfault (../fs/userfaultfd.c:379).
374
375		/*
376		 * Coredumping runs without mmap_sem so we can only check that
377		 * the mmap_sem is held, if PF_DUMPCORE was not set.
378		 */
379		WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
380
381		ctx = vmf->vma->vm_userfaultfd_ctx.ctx;
382		if (!ctx)
383			goto out;
(gdb) disassemble /m handle_userfault
Dump of assembler code for function handle_userfault:
187		memset(msg, 0, sizeof(struct uffd_msg));
   0xffffffff8131d723 <+307>:	movq   $0x0,0x20(%rsp)
   0xffffffff8131d73a <+330>:	movq   $0x0,0x28(%rsp)
   0xffffffff8131d743 <+339>:	movq   $0x0,0x38(%rsp)

188	}
189
190	static inline struct uffd_msg userfault_msg(unsigned long address,
191						    unsigned int flags,
192						    unsigned long reason,
193						    unsigned int features)
194	{
195		struct uffd_msg msg;
196		msg_init(&msg);
197		msg.event = UFFD_EVENT_PAGEFAULT;
   0xffffffff8131d74c <+348>:	movb   $0x12,0x20(%rsp)

198		msg.arg.pagefault.address = address;
   0xffffffff8131d751 <+353>:	mov    %rcx,0x30(%rsp)

199		if (flags & FAULT_FLAG_WRITE)
   0xffffffff8131d72c <+316>:	test   $0x1,%al
   0xffffffff8131d756 <+358>:	je     0xffffffff8131d761

200		/*
201		 * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
202		 * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE
203		 * was not set in a UFFD_EVENT_PAGEFAULT, it means it
204		 * was a read fault, otherwise if set it means it's
205		 * a write fault.
206		 */
207		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
   0xffffffff8131d758 <+360>:	movq   $0x1,0x28(%rsp)

208		if (reason & VM_UFFD_WP)
   0xffffffff8131d761 <+369>:	test   %r14b,%r14b
   0xffffffff8131d764 <+372>:	je     0xffffffff8131d76c

209		/*
210		 * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
211		 * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was
212		 * not set in a UFFD_EVENT_PAGEFAULT, it means it was
213		 * a missing fault, otherwise if set it means it's a
214		 * write protect fault.
215		 */
216		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
   0xffffffff8131d766 <+374>:	orq    $0x2,0x28(%rsp)

217		if (features & UFFD_FEATURE_THREAD_ID)
   0xffffffff8131d76c <+380>:	and    $0x1,%dh
   0xffffffff8131d76f <+383>:	je     0xffffffff8131d78a

218			msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
   0xffffffff8131d783 <+403>:	mov    %eax,0x38(%rsp)
   0xffffffff8131d787 <+407>:	mov    0x8(%rbx),%eax

219		return msg;
   0xffffffff8131d78a <+410>:	mov    0x20(%rsp),%rdi
   0xffffffff8131d792 <+418>:	mov    0x28(%rsp),%rsi
   0xffffffff8131d797 <+423>:	mov    0x30(%rsp),%rcx
   0xffffffff8131d79c <+428>:	mov    0x38(%rsp),%rdx
   0xffffffff8131d7bc <+460>:	mov    %rdi,0x40(%rsp)
   0xffffffff8131d7cc <+476>:	mov    %rsi,0x48(%rsp)
   0xffffffff8131d7d1 <+481>:	mov    %rcx,0x50(%rsp)
   0xffffffff8131d7d6 <+486>:	mov    %rdx,0x58(%rsp)

220	}
221
222	#ifdef CONFIG_HUGETLB_PAGE
223	/*
224	 * Same functionality as userfaultfd_must_wait below with modifications for
225	 * hugepmd ranges.
226	 */
227	static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
228						      struct vm_area_struct *vma,
229						      unsigned long address,
230						      unsigned long flags,
231						      unsigned long reason)
232	{
233		struct mm_struct *mm = ctx->mm;
234		pte_t *ptep, pte;
235		bool ret = true;
   0xffffffff8131d8c9 <+729>:	mov    $0x1,%ebx

236
237		VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
   0xffffffff8131da7c <+1164>:	test   %rdx,%rdx
   0xffffffff8131da7f <+1167>:	je     0xffffffff8131dc78
   0xffffffff8131da85 <+1173>:	mov    %rsi,(%rsp)
   0xffffffff8131dc7a <+1674>:	ud2

238
239		ptep = huge_pte_offset(mm, address, vma_mmu_pagesize(vma));
   0xffffffff8131da89 <+1177>:	callq  0xffffffff8127de70
   0xffffffff8131da8e <+1182>:	mov    (%rsp),%rsi
   0xffffffff8131da92 <+1186>:	mov    %rax,%rdx
   0xffffffff8131da95 <+1189>:	mov    %rbx,%rdi
   0xffffffff8131da98 <+1192>:	callq  0xffffffff8127fbd0

240
241		if (!ptep)
   0xffffffff8131da9d <+1197>:	test   %rax,%rax
   0xffffffff8131daa0 <+1200>:	je     0xffffffff8131d8c9

242			goto out;
243
244		ret = false;
245		pte = huge_ptep_get(ptep);
246
247		/*
248		 * Lockless access: we're in a wait_event so it's ok if it
249		 * changes under us.
250		 */
251		if (huge_pte_none(pte))
   0xffffffff8131daa9 <+1209>:	test   $0xffffffffffffff9f,%rax
   0xffffffff8131daaf <+1215>:	sete   %bl

252			ret = true;
253		if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
   0xffffffff8131dab2 <+1218>:	test   $0x2,%al
   0xffffffff8131dab4 <+1220>:	jne    0xffffffff8131d8ce

254			ret = true;
   0xffffffff8131daba <+1226>:	test   %r14b,%r14b
   0xffffffff8131dabd <+1229>:	mov    $0x1,%eax
   0xffffffff8131dac2 <+1234>:	cmovne %eax,%ebx
   0xffffffff8131dac5 <+1237>:	jmpq   0xffffffff8131d8ce

255	out:
256		return ret;
257	}
258	#else
259	static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
260						      struct vm_area_struct *vma,
261						      unsigned long address,
262						      unsigned long flags,
263						      unsigned long reason)
264	{
265		return false;	/* should never get here */
266	}
267	#endif /* CONFIG_HUGETLB_PAGE */
268
269	/*
270	 * Verify the pagetables are still not ok after having reigstered into
271	 * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
272	 * userfault that has already been resolved, if userfaultfd_read and
273	 * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
274	 * threads.
275	 */
276	static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
277						 unsigned long address,
278						 unsigned long flags,
279						 unsigned long reason)
280	{
281		struct mm_struct *mm = ctx->mm;
282		pgd_t *pgd;
283		p4d_t *p4d;
284		pud_t *pud;
285		pmd_t *pmd, _pmd;
286		pte_t *pte;
287		bool ret = true;
288
289		VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
   0xffffffff8131d8a2 <+690>:	test   %rax,%rax
   0xffffffff8131d8a5 <+693>:	je     0xffffffff8131dc7a

290
291		pgd = pgd_offset(mm, address);
   0xffffffff8131d8ab <+699>:	mov    %rcx,%rax
   0xffffffff8131d8ae <+702>:	mov    0x50(%rdx),%rdx
   0xffffffff8131d8b2 <+706>:	shr    $0x27,%rax
   0xffffffff8131d8b6 <+710>:	and    $0x1ff,%eax
   0xffffffff8131d8bb <+715>:	mov    (%rdx,%rax,8),%rdi

292		if (!pgd_present(*pgd))
293			goto out;
294		p4d = p4d_offset(pgd, address);
295		if (!p4d_present(*p4d))
   0xffffffff8131d8bf <+719>:	test   $0x1,%dil
   0xffffffff8131d8c3 <+723>:	jne    0xffffffff8131dae9

296			goto out;
297		pud = pud_offset(p4d, address);
298		if (!pud_present(*pud))
   0xffffffff8131db40 <+1360>:	test   $0x1,%al
   0xffffffff8131db42 <+1362>:	je     0xffffffff8131d8c9

299			goto out;
300		pmd = pmd_offset(pud, address);
301		/*
302		 * READ_ONCE must function as a barrier with narrower scope
303		 * and it must be equivalent to:
304		 *   _pmd = *pmd; barrier();
305		 *
306		 * This is to deal with the instability (as in
307		 * pmd_trans_unstable) of the pmd.
308		 */
309		_pmd = READ_ONCE(*pmd);
310		if (pmd_none(_pmd))
   0xffffffff8131db7a <+1418>:	test   $0xffffffffffffff9f,%rdi
   0xffffffff8131db81 <+1425>:	je     0xffffffff8131d8c9

311			goto out;
312
313		ret = false;
   0xffffffff8131db9c <+1452>:	xor    %ebx,%ebx

314		if (!pmd_present(_pmd))
   0xffffffff8131dba4 <+1460>:	test   $0x181,%eax
   0xffffffff8131dba9 <+1465>:	je     0xffffffff8131d8ce

315			goto out;
316
317		if (pmd_trans_huge(_pmd))
318			goto out;
319
320		/*
321		 * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
322		 * and use the standard pte_offset_map() instead of parsing _pmd.
323		 */
324		pte = pte_offset_map(pmd, address);
325		/*
326		 * Lockless access: we're in a wait_event so it's ok if it
327		 * changes under us.
328		 */
329		if (pte_none(*pte))
   0xffffffff8131dbd1 <+1505>:	testq  $0xffffffffffffff9f,(%rcx,%r8,1)
   0xffffffff8131dbd9 <+1513>:	sete   %bl
   0xffffffff8131dbdc <+1516>:	jmpq   0xffffffff8131d8ce

330			ret = true;
331		pte_unmap(pte);
332
333	out:
334		return ret;
335	}
336
337	/*
338	 * The locking rules involved in returning VM_FAULT_RETRY depending on
339	 * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
340	 * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
341	 * recommendation in __lock_page_or_retry is not an understatement.
342	 *
343	 * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_sem must be released
344	 * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
345	 * not set.
346	 *
347	 * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
348	 * set, VM_FAULT_RETRY can still be returned if and only if there are
349	 * fatal_signal_pending()s, and the mmap_sem must be released before
350	 * returning it.
351	 */
352	vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
353	{
   0xffffffff8131d5f0 <+0>:	callq  0xffffffff81a01b30 <__fentry__>
   0xffffffff8131d5f5 <+5>:	push   %r15
   0xffffffff8131d600 <+16>:	push   %r14
   0xffffffff8131d602 <+18>:	push   %r13
   0xffffffff8131d604 <+20>:	push   %r12
   0xffffffff8131d606 <+22>:	push   %rbp
   0xffffffff8131d607 <+23>:	push   %rbx
   0xffffffff8131d608 <+24>:	mov    %rdi,%rbx
   0xffffffff8131d60b <+27>:	sub    $0xc0,%rsp
   0xffffffff8131d612 <+34>:	mov    %gs:0x28,%rax
   0xffffffff8131d61b <+43>:	mov    %rax,0xb8(%rsp)
   0xffffffff8131d623 <+51>:	xor    %eax,%eax

354		struct mm_struct *mm = vmf->vma->vm_mm;
   0xffffffff8131d62c <+60>:	mov    (%rdi),%rax
   0xffffffff8131d62f <+63>:	mov    0x40(%rax),%r12

355		struct userfaultfd_ctx *ctx;
   0xffffffff8131d890 <+672>:	mov    0x160(%r13),%rdx
   0xffffffff8131da6e <+1150>:	mov    0x160(%r13),%rbx

356		struct userfaultfd_wait_queue uwq;
357		vm_fault_t ret = VM_FAULT_SIGBUS;
   0xffffffff8131d635 <+69>:	mov    $0x2,%ebp

358		bool must_wait, return_to_userland;
359		long blocking_state;
360
361		/*
362		 * We don't do userfault handling for the final child pid update.
363		 *
364		 * We also don't do userfault handling during
365		 * coredumping. hugetlbfs has the special
366		 * follow_hugetlb_page() to skip missing pages in the
367		 * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
368		 * the no_page_table() helper in follow_page_mask(), but the
369		 * shmem_vm_ops->fault method is invoked even during
370		 * coredumping without mmap_sem and it ends up here.
371		 */
372		if (current->flags & (PF_EXITING|PF_DUMPCORE))
   0xffffffff8131d625 <+53>:	testl  $0x204,0x24(%rdx)
   0xffffffff8131d633 <+67>:	je     0xffffffff8131d665

373			goto out;
374
375		/*
376		 * Coredumping runs without mmap_sem so we can only check that
377		 * the mmap_sem is held, if PF_DUMPCORE was not set.
378		 */
379		WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
   0xffffffff8131d66d <+125>:	test   %rdx,%rdx
   0xffffffff8131d670 <+128>:	je     0xffffffff8131dbfd
   0xffffffff8131dc00 <+1552>:	jmpq   0xffffffff8131d676
   0xffffffff8131dc05 <+1557>:	ud2

380
381		ctx = vmf->vma->vm_userfaultfd_ctx.ctx;
   0xffffffff8131d676 <+134>:	mov    0xc0(%rax),%r13

382		if (!ctx)
   0xffffffff8131d67d <+141>:	test   %r13,%r13
   0xffffffff8131d680 <+144>:	je     0xffffffff8131d635

383			goto out;
384
385		BUG_ON(ctx->mm != mm);
   0xffffffff8131d682 <+146>:	cmp    0x160(%r13),%r12
   0xffffffff8131d689 <+153>:	jne    0xffffffff8131dc05

386
387		VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
   0xffffffff8131d68f <+159>:	test   $0xffffffffffffedff,%rsi
   0xffffffff8131d696 <+166>:	jne    0xffffffff8131dbfb
   0xffffffff8131dbfd <+1549>:	ud2
   0xffffffff8131dbff <+1551>:	nop

388		VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));
   0xffffffff8131d69c <+172>:	mov    %rsi,%r14
   0xffffffff8131d69f <+175>:	shr    $0x9,%rsi
   0xffffffff8131d6a3 <+179>:	shr    $0xc,%r14
   0xffffffff8131d6a7 <+183>:	xor    $0x1,%rsi
   0xffffffff8131d6ab <+187>:	and    $0x1,%r14d
   0xffffffff8131d6af <+191>:	and    $0x1,%esi
   0xffffffff8131d6b2 <+194>:	cmp    %sil,%r14b
   0xffffffff8131d6b5 <+197>:	jne    0xffffffff8131dbf9
   0xffffffff8131dbfb <+1547>:	ud2

389
390		if (ctx->features & UFFD_FEATURE_SIGBUS)
   0xffffffff8131d6bb <+203>:	testb  $0x80,0x150(%r13)
   0xffffffff8131d6c3 <+211>:	jne    0xffffffff8131d635

391			goto out;
392
393		/*
394		 * If it's already released don't get it. This avoids to loop
395		 * in __get_user_pages if userfaultfd_release waits on the
396		 * caller of handle_userfault to release the mmap_sem.
397		 */
398		if (unlikely(READ_ONCE(ctx->released))) {
   0xffffffff8131d6d1 <+225>:	test   %al,%al
   0xffffffff8131d6d3 <+227>:	jne    0xffffffff8131dc07

399			/*
400			 * Don't return VM_FAULT_SIGBUS in this case, so a non
401			 * cooperative manager can close the uffd after the
402			 * last UFFDIO_COPY, without risking to trigger an
403			 * involuntary SIGBUS if the process was starting the
404			 * userfaultfd while the userfaultfd was still armed
405			 * (but after the last UFFDIO_COPY). If the uffd
406			 * wasn't already closed when the userfault reached
407			 * this point, that would normally be solved by
408			 * userfaultfd_must_wait returning 'false'.
409			 *
410			 * If we were to return VM_FAULT_SIGBUS here, the non
411			 * cooperative manager would be instead forced to
412			 * always call UFFDIO_UNREGISTER before it can safely
413			 * close the uffd.
414			 */
415			ret = VM_FAULT_NOPAGE;
   0xffffffff8131dc07 <+1559>:	mov    $0x100,%ebp
   0xffffffff8131dc0c <+1564>:	jmpq   0xffffffff8131d63a

416			goto out;
417		}
418
419		/*
420		 * Check that we can return VM_FAULT_RETRY.
421		 *
422		 * NOTE: it should become possible to return VM_FAULT_RETRY
423		 * even if FAULT_FLAG_TRIED is set without leading to gup()
424		 * -EBUSY failures, if the userfaultfd is to be extended for
425		 * VM_UFFD_WP tracking and we intend to arm the userfault
426		 * without first stopping userland access to the memory. For
427		 * VM_UFFD_MISSING userfaults this is enough for now.
428		 */
429		if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
   0xffffffff8131d6d9 <+233>:	mov    0x8(%rbx),%eax
   0xffffffff8131d6dc <+236>:	test   $0x4,%al
   0xffffffff8131d6de <+238>:	je     0xffffffff8131dc11

430			/*
431			 * Validate the invariant that nowait must allow retry
432			 * to be sure not to return SIGBUS erroneously on
433			 * nowait invocations.
434			 */
435			BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
   0xffffffff8131dc11 <+1569>:	test   $0x8,%al
   0xffffffff8131dc13 <+1571>:	jne    0xffffffff8131dc5e

436	#ifdef CONFIG_DEBUG_VM
437		if (printk_ratelimit()) {
   0xffffffff8131dc15 <+1573>:	mov    $0xffffffff81e51ae0,%rdi
   0xffffffff8131dc1c <+1580>:	callq  0xffffffff81121440 <__printk_ratelimit>
   0xffffffff8131dc21 <+1585>:	test   %eax,%eax
   0xffffffff8131dc23 <+1587>:	je     0xffffffff8131d635

438			printk(KERN_WARNING
   0xffffffff8131dc29 <+1593>:	mov    0x8(%rbx),%esi
   0xffffffff8131dc2c <+1596>:	mov    $0xffffffff820edf58,%rdi
   0xffffffff8131dc33 <+1603>:	xor    %eax,%eax
   0xffffffff8131dc35 <+1605>:	callq  0xffffffff81124f44

439			       "FAULT_FLAG_ALLOW_RETRY missing %x\n",
440			       vmf->flags);
441			dump_stack();
   0xffffffff8131dc3a <+1610>:	callq  0xffffffff818e44bd
   0xffffffff8131dc3f <+1615>:	jmpq   0xffffffff8131d635

442		}
443	#endif
444		goto out;
445	}
446
447	/*
448	 * Handle nowait, not much to do other than tell it to retry
449	 * and wait.
450	 */
451		ret = VM_FAULT_RETRY;
   0xffffffff8131d6e6 <+246>:	mov    $0x400,%ebp
   0xffffffff8131dc6e <+1662>:	mov    $0x400,%ebp
   0xffffffff8131dc73 <+1667>:	jmpq   0xffffffff8131d9c6
   0xffffffff8131dc78 <+1672>:	ud2

452		if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
   0xffffffff8131d6e4 <+244>:	test   $0x8,%al
   0xffffffff8131d6eb <+251>:	jne    0xffffffff8131d63a

453			goto out;
454
455		/* take the reference before dropping the mmap_sem */
456		userfaultfd_ctx_get(ctx);
   0xffffffff8131d6f1 <+257>:	mov    %r13,%rdi
   0xffffffff8131d6f4 <+260>:	callq  0xffffffff8131b140

457
458		init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
459		uwq.wq.private = current;
   0xffffffff8131d702 <+274>:	mov    %rax,0x88(%rsp)

460		uwq.msg = userfault_msg(vmf->address, vmf->flags, reason,
   0xffffffff8131d70a <+282>:	mov    0x8(%rbx),%eax
   0xffffffff8131d70d <+285>:	mov    0x18(%rbx),%rcx
   0xffffffff8131d711 <+289>:	mov    0x150(%r13),%edx
   0xffffffff8131d7c1 <+465>:	mov    %rdi,0x60(%rsp)
   0xffffffff8131d7e1 <+497>:	mov    %rsi,0x68(%rsp)
   0xffffffff8131d7e6 <+502>:	mov    %rcx,0x70(%rsp)
   0xffffffff8131d7ef <+511>:	mov    %rdx,0x78(%rsp)

461					ctx->features);
462		uwq.ctx = ctx;
   0xffffffff8131d7a8 <+440>:	mov    %r13,0xa8(%rsp)

463		uwq.waken = false;
   0xffffffff8131d7b0 <+448>:	movb   $0x0,0xb0(%rsp)

464
465		return_to_userland =
   0xffffffff8131d7a1 <+433>:	cmp    $0x50,%eax
   0xffffffff8131d7a4 <+436>:	sete   %r15b

466			(vmf->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
   0xffffffff8131d78f <+415>:	and    $0x50,%eax

467			(FAULT_FLAG_USER|FAULT_FLAG_KILLABLE);
468		blocking_state = return_to_userland ? TASK_INTERRUPTIBLE :
   0xffffffff8131d7b8 <+456>:	cmp    $0x1,%r15b
   0xffffffff8131d7c9 <+473>:	sbb    %rbp,%rbp
   0xffffffff8131d7db <+491>:	and    $0x101,%ebp
   0xffffffff8131d7eb <+507>:	add    $0x1,%rbp

469			TASK_KILLABLE;
470
471		spin_lock_irq(&ctx->fault_pending_wqh.lock);
472		/*
473		 * After the __add_wait_queue the uwq is visible to userland
474		 * through poll/read().
475		 */
476		__add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
477		/*
478		 * The smp_mb() after __set_current_state prevents the reads
479		 * following the spin_unlock to happen before the list_add in
480		 * __add_wait_queue.
481		 */
482		set_current_state(blocking_state);
   0xffffffff8131d858 <+616>:	mov    %rbp,0x18(%rsp)
   0xffffffff8131d85d <+621>:	movq   $0xffffffff8131d84f,0x1bc8(%rdx)
   0xffffffff8131d868 <+632>:	mov    0x18(%rsp),%rax
   0xffffffff8131d86d <+637>:	xchg   %rax,0x10(%rdx)
   0xffffffff8131d874 <+644>:	mov    %rax,0x18(%rsp)
   0xffffffff8131d879 <+649>:	mov    0x18(%rsp),%rax

483		spin_unlock_irq(&ctx->fault_pending_wqh.lock);
484
485		if (!is_vm_hugetlb_page(vmf->vma))
   0xffffffff8131d883 <+659>:	mov    (%rbx),%rdi
   0xffffffff8131d886 <+662>:	testb  $0x40,0x52(%rdi)
   0xffffffff8131d88a <+666>:	jne    0xffffffff8131da6a

486			must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
   0xffffffff8131d897 <+679>:	mov    0x18(%rbx),%rcx

487							  reason);
488		else
489			must_wait = userfaultfd_huge_must_wait(ctx, vmf->vma,
   0xffffffff8131da6a <+1146>:	mov    0x18(%rbx),%rsi

490							       vmf->address,
491							       vmf->flags, reason);
492		up_read(&mm->mmap_sem);
   0xffffffff8131d8ce <+734>:	add    $0xa8,%r12
   0xffffffff8131d8d5 <+741>:	mov    %r12,%rdi
   0xffffffff8131d8d8 <+744>:	callq  0xffffffff81109600

493
494		if (likely(must_wait && !READ_ONCE(ctx->released) &&
   0xffffffff8131d8dd <+749>:	test   %bl,%bl
   0xffffffff8131d8df <+751>:	je     0xffffffff8131dc6e
   0xffffffff8131d8ed <+765>:	test   %al,%al
   0xffffffff8131d8ef <+767>:	jne    0xffffffff8131dc6e
   0xffffffff8131d8f5 <+773>:	test   %r15b,%r15b
   0xffffffff8131d8f8 <+776>:	je     0xffffffff8131daca
   0xffffffff8131d911 <+801>:	xor    $0x1,%rax
   0xffffffff8131d915 <+805>:	test   %rax,%rax
   0xffffffff8131d918 <+808>:	je     0xffffffff8131dc6e

495			   (return_to_userland ? !signal_pending(current) :
496			    !fatal_signal_pending(current)))) {
497			wake_up_poll(&ctx->fd_wqh, EPOLLIN);
   0xffffffff8131d91e <+814>:	lea    0x90(%r13),%rdi
   0xffffffff8131d925 <+821>:	mov    $0x1,%ecx
   0xffffffff8131d92a <+826>:	mov    $0x1,%edx
   0xffffffff8131d92f <+831>:	mov    $0x3,%esi
   0xffffffff8131d934 <+836>:	callq  0xffffffff810fc760 <__wake_up>

498			schedule();
   0xffffffff8131d939 <+841>:	callq  0xffffffff81905ad0

499			ret |= VM_FAULT_MAJOR;
   0xffffffff8131d9c1 <+977>:	mov    $0x404,%ebp

500
501			/*
502			 * False wakeups can orginate even from rwsem before
503			 * up_read() however userfaults will wait either for a
504			 * targeted wakeup on the specific uwq waitqueue from
505			 * wake_userfault() or for signals or for uffd
506			 * release.
507			 */
508			while (!READ_ONCE(uwq.waken)) {
   0xffffffff8131d946 <+854>:	test   %al,%al
   0xffffffff8131d948 <+856>:	jne    0xffffffff8131d9c1
   0xffffffff8131d96d <+893>:	test   %al,%al
   0xffffffff8131d96f <+895>:	jne    0xffffffff8131d9c1

509			/*
510			 * This needs the full smp_store_mb()
511			 * guarantee as the state write must be
512			 * visible to other CPUs before reading
513			 * uwq.waken from other CPUs.
514			 */
515				set_current_state(blocking_state);
   0xffffffff8131d971 <+897>:	movq   $0xffffffff8131d971,0x1bc8(%rbx)
   0xffffffff8131d97c <+908>:	mov    %rbp,0x20(%rsp)
   0xffffffff8131d981 <+913>:	mov    0x20(%rsp),%rax
   0xffffffff8131d986 <+918>:	xchg   %rax,0x10(%rbx)
   0xffffffff8131d98a <+922>:	mov    %rax,0x20(%rsp)
   0xffffffff8131d98f <+927>:	mov    0x20(%rsp),%rax

516				if (READ_ONCE(uwq.waken) ||
   0xffffffff8131d99c <+940>:	test   %al,%al
   0xffffffff8131d99e <+942>:	jne    0xffffffff8131d9c1
   0xffffffff8131d9a8 <+952>:	test   %al,%al
   0xffffffff8131d9aa <+954>:	jne    0xffffffff8131d9c1
   0xffffffff8131d9ac <+956>:	test   %r15b,%r15b
   0xffffffff8131d9b2 <+962>:	jne    0xffffffff8131d955

517				    READ_ONCE(ctx->released) ||
   0xffffffff8131d955 <+869>:	test   $0x4,%al
   0xffffffff8131d957 <+871>:	jne    0xffffffff8131d9c1
   0xffffffff8131d959 <+873>:	nopl   0x0(%rax)

518				    (return_to_userland ? signal_pending(current) :
519				     fatal_signal_pending(current)))
520					break;
521				schedule();
   0xffffffff8131d960 <+880>:	callq  0xffffffff81905ad0

522			}
523		}
524
525		__set_current_state(TASK_RUNNING);
   0xffffffff8131d9d2 <+994>:	movq   $0xffffffff8131d9c6,0x1bc8(%rax)
   0xffffffff8131d9dd <+1005>:	movq   $0x0,0x10(%rax)

526
527		if (return_to_userland) {
   0xffffffff8131d9cf <+991>:	test   %r15b,%r15b
   0xffffffff8131d9e5 <+1013>:	je     0xffffffff8131d9f3

528			if (signal_pending(current) &&
   0xffffffff8131d9ea <+1018>:	and    $0x4,%edx
   0xffffffff8131d9ed <+1021>:	jne    0xffffffff8131dc44

529			    !fatal_signal_pending(current)) {
530				/*
531				 * If we got a SIGSTOP or SIGCONT and this is
532				 * a normal userland page fault, just let
533				 * userland return so the signal will be
534				 * handled and gdb debugging works. The page
535				 * fault code immediately after we return from
536				 * this function is going to release the
537				 * mmap_sem and it's not depending on it
538				 * (unlike gup would if we were not to return
539				 * VM_FAULT_RETRY).
540				 *
541				 * If a fatal signal is pending we still take
542				 * the streamlined VM_FAULT_RETRY failure path
543				 * and there's no need to retake the mmap_sem
544				 * in such case.
545				 */
546				down_read(&mm->mmap_sem);
   0xffffffff8131dc4c <+1628>:	mov    %r12,%rdi
   0xffffffff8131dc54 <+1636>:	callq  0xffffffff8190a6a0
   0xffffffff8131dc59 <+1641>:	jmpq   0xffffffff8131d9f3
   0xffffffff8131dc5e <+1646>:	ud2

547				ret = VM_FAULT_NOPAGE;
   0xffffffff8131dc4f <+1631>:	mov    $0x100,%ebp

548			}
549		}
550
551		/*
552		 * Here we race with the list_del; list_add in
553		 * userfaultfd_ctx_read(), however because we don't ever run
554		 * list_del_init() to refile across the two lists, the prev
555		 * and next pointers will never point to self. list_add also
556		 * would never let any of the two pointers to point to
557		 * self. So list_empty_careful won't risk to see both pointers
558		 * pointing to self at any time during the list refile. The
559		 * only case where list_del_init() is called is the full
560		 * removal in the wake function and there we don't re-list_add
561		 * and it's fine not to block on the spinlock. The uwq on this
562		 * kernel stack can be released after the list_del_init.
563		 */
564		if (!list_empty_careful(&uwq.wq.entry)) {
565			spin_lock_irq(&ctx->fault_pending_wqh.lock);
566			/*
567			 * No need of list_del_init(), the uwq on the stack
568			 * will be freed shortly anyway.
569			 */
570			list_del(&uwq.wq.entry);
571			spin_unlock_irq(&ctx->fault_pending_wqh.lock);
572		}
573
574		/*
575		 * ctx may go away after this if the userfault pseudo fd is
576		 * already released.
577		 */
578		userfaultfd_ctx_put(ctx);
   0xffffffff8131da5d <+1133>:	mov    %r13,%rdi
   0xffffffff8131da60 <+1136>:	callq  0xffffffff8131b4e0
   0xffffffff8131da65 <+1141>:	jmpq   0xffffffff8131d63a

579
580	out:
581		return ret;
582	}
   0xffffffff8131d63a <+74>:	mov    0xb8(%rsp),%rcx
   0xffffffff8131d642 <+82>:	xor    %gs:0x28,%rcx
   0xffffffff8131d64b <+91>:	mov    %ebp,%eax
   0xffffffff8131d64d <+93>:	jne    0xffffffff8131dbf4
   0xffffffff8131d653 <+99>:	add    $0xc0,%rsp
   0xffffffff8131d65a <+106>:	pop    %rbx
   0xffffffff8131d65b <+107>:	pop    %rbp
   0xffffffff8131d65c <+108>:	pop    %r12
   0xffffffff8131d65e <+110>:	pop    %r13
   0xffffffff8131d660 <+112>:	pop    %r14
   0xffffffff8131d662 <+114>:	pop    %r15
   0xffffffff8131d664 <+116>:	retq
   0xffffffff8131dbf4 <+1540>:	callq  0xffffffff810a04f0 <__stack_chk_fail>
   0xffffffff8131dbf9 <+1545>:	ud2

End of assembler dump.
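A note on the line-number mismatch: the panic header reports fs/userfaultfd.c:385 (the file:line recorded in the __bug_table entry at the BUG_ON() site), while gdb resolves handle_userfault+0x615 to line 379. The dump above reconciles the two: the trapping ud2 at <+1557> (0xffffffff8131dc05, i.e. +0x615) is the out-of-line target of the jne at <+153> belonging to BUG_ON(ctx->mm != mm) at line 385; the DWARF line table merely attributes that stub to the neighboring WARN_ON_ONCE() at line 379. To cross-check which statement a trap address belongs to, these gdb commands help (exact output depends on the build; /s requires gdb >= 7.11):

(gdb) info line *0xffffffff8131dc05
(gdb) disassemble /s handle_userfault

Unlike /m, which groups instructions in source-line order (scattering the out-of-line ud2 stubs, as seen above), /s walks the function in address order, which makes the jne <+153> -> ud2 <+1557> pairing easier to spot.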