Post

Platform Device

Kernel initialization before DeviceTree

Although we are not going to cover the details of the initialization procedure, this post will take a look at what happens before the device tree is initialized.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
        __HEAD
        /*
         * DO NOT MODIFY. Image header expected by Linux boot-loaders.
         *
         * The field order and sizes below are part of that contract; the
         * second word is the branch that transfers control to primary_entry.
         */
        efi_signature_nop                       // special NOP to identify as PE/COFF executable
        b       primary_entry                   // branch to kernel start, magic
        .quad   0                               // Image load offset from start of RAM, little-endian
        le64sym _kernel_size_le                 // Effective size of kernel image, little-endian
        le64sym _kernel_flags_le                // Informative flags, little-endian
        .quad   0                               // reserved
        .quad   0                               // reserved
        .quad   0                               // reserved
        .ascii  ARM64_IMAGE_MAGIC               // Magic number
        .long   .Lpe_header_offset              // Offset to the PE header.

        // NOTE(review): macro defined elsewhere; presumably emits the PE/COFF
        // header that .Lpe_header_offset above refers to -- confirm.
        __EFI_PE_HEADER

        // NOTE(review): macro defined elsewhere; presumably switches to the
        // init text section for the boot code that follows -- confirm.
        __INIT

        /*
         * The following callee saved general purpose registers are used on the
         * primary lowlevel boot path:
         *
         *  Register   Scope                      Purpose
         *  x21        primary_entry() .. start_kernel()        FDT pointer passed at boot in x0
         *  x23        primary_entry() .. start_kernel()        physical misalignment/KASLR offset
         *  x28        __create_page_tables()                   callee preserved temp register
         *  x19/x20    __primary_switch()                       callee preserved temp registers
         *  x24        __primary_switch() .. relocate_kernel()  current RELR displacement
         */

/*
 * primary_entry - target of the "b primary_entry" in the image header.
 *
 * Calls the early setup helpers in a fixed order and then branches (without
 * linking) to __primary_switch, so control never comes back to this routine.
 * x23 is initialised here; the other callee-saved registers used on this
 * path are listed in the register table that precedes this routine.
 */
SYM_CODE_START(primary_entry)
        bl      preserve_boot_args              // NOTE(review): presumably stashes the boot-time x0 (FDT pointer, x21 per the register table) -- confirm
        bl      init_kernel_el                  // w0=cpu_boot_mode
        adrp    x23, __PHYS_OFFSET              // PC-relative page address of __PHYS_OFFSET
        and     x23, x23, MIN_KIMG_ALIGN - 1    // KASLR offset, defaults to 0
        bl      set_cpu_boot_mode_flag          // w0 from init_kernel_el is still untouched here (only x23 was written)
        bl      __create_page_tables
        /*
         * The following calls CPU setup code, see arch/arm64/mm/proc.S for
         * details.
         * On return, the CPU will be ready for the MMU to be turned on and
         * the TCR will have been set.
         */
        bl      __cpu_setup                     // initialise processor
        b       __primary_switch                // plain branch: no return to this routine
SYM_CODE_END(primary_entry)

/*
 * __primary_switch - turn the MMU on and continue at __primary_switched.
 *
 * Entered by a plain branch from primary_entry; x0 is treated as the new
 * SCTLR_EL1 value (see the first instruction below).
 *
 * With CONFIG_RANDOMIZE_BASE the first transfer to __primary_switched uses
 * blr, so that routine can return here once it has recorded a KASLR offset
 * in x23. In that case the kernel mapping is rebuilt with the MMU disabled,
 * the kernel is relocated again, and the final transfer uses br, which does
 * not return.
 */
SYM_FUNC_START_LOCAL(__primary_switch)
#ifdef CONFIG_RANDOMIZE_BASE
        mov     x19, x0                         // preserve new SCTLR_EL1 value
        mrs     x20, sctlr_el1                  // preserve old SCTLR_EL1 value
#endif

        adrp    x1, init_pg_dir                 // NOTE(review): presumably the page-table root handed to __enable_mmu -- confirm
        bl      __enable_mmu
#ifdef CONFIG_RELOCATABLE
#ifdef CONFIG_RELR
        mov     x24, #0                         // no RELR displacement yet
#endif
        bl      __relocate_kernel
#ifdef CONFIG_RANDOMIZE_BASE
        ldr     x8, =__primary_switched         // x8 = link-time (virtual) address from the literal pool
        adrp    x0, __PHYS_OFFSET               // x0 = __PHYS_OFFSET, as __primary_switched expects
        blr     x8                              // call with link: may return if KASLR must be enabled

        /*
         * If we return here, we have a KASLR displacement in x23 which we need
         * to take into account by discarding the current kernel mapping and
         * creating a new one.
         */
        pre_disable_mmu_workaround
        msr     sctlr_el1, x20                  // disable the MMU
        isb
        bl      __create_page_tables            // recreate kernel mapping

        tlbi    vmalle1                         // Remove any stale TLB entries
        dsb     nsh

        msr     sctlr_el1, x19                  // re-enable the MMU
        isb
        ic      iallu                           // flush instructions fetched
        dsb     nsh                             // via old mapping
        isb

        bl      __relocate_kernel
#endif
#endif
        ldr     x8, =__primary_switched
        adrp    x0, __PHYS_OFFSET
        br      x8                              // final transfer, no link: does not return here
SYM_FUNC_END(__primary_switch)

/*
 * The following fragment of code is executed with the MMU enabled.
 *
 *   x0 = __PHYS_OFFSET
 *
 * Also consumed here: x21 (stored to __fdt_pointer), x23 (tested and updated
 * for KASLR) and x30 (saved in the frame record so the KASLR path can return
 * to the caller).
 *
 * Exits either by returning to the caller (KASLR offset newly recorded in
 * x23) or by branching to start_kernel, which is not expected to return
 * since x30 is cleared first.
 */
SYM_FUNC_START_LOCAL(__primary_switched)
        adrp    x4, init_thread_union
        add     sp, x4, #THREAD_SIZE            // sp = page address of init_thread_union + THREAD_SIZE
        adr_l   x5, init_task
        msr     sp_el0, x5                      // Save thread_info

#ifdef CONFIG_ARM64_PTR_AUTH
        __ptrauth_keys_init_cpu x5, x6, x7, x8
#endif

        adr_l   x8, vectors                     // load VBAR_EL1 with virtual
        msr     vbar_el1, x8                    // vector table address
        isb

        // Push a 16-byte frame record {0, x30} and point x29 at it; the
        // KASLR return path below reloads x29/x30 from it.
        stp     xzr, x30, [sp, #-16]!
        mov     x29, sp

#ifdef CONFIG_SHADOW_CALL_STACK
        adr_l   scs_sp, init_shadow_call_stack  // Set shadow call stack
#endif

        str_l   x21, __fdt_pointer, x5          // Save FDT pointer

        ldr_l   x4, kimage_vaddr                // Save the offset between
        sub     x4, x4, x0                      // the kernel virtual and
        str_l   x4, kimage_voffset, x5          // physical mappings

        // Clear BSS
        // x0 = start, x1 = 0, x2 = __bss_stop - __bss_start
        // NOTE(review): assumes __pi_memset takes (dst, value, length) like memset -- confirm
        adr_l   x0, __bss_start
        mov     x1, xzr
        adr_l   x2, __bss_stop
        sub     x2, x2, x0
        bl      __pi_memset
        dsb     ishst                           // Make zero page visible to PTW

#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
        bl      kasan_early_init
#endif
#ifdef CONFIG_RANDOMIZE_BASE
        // Any x23 bit at or above MIN_KIMG_ALIGN set means an offset has
        // already been recorded, so skip the KASLR setup.
        tst     x23, ~(MIN_KIMG_ALIGN - 1)      // already running randomized?
        b.ne    0f
        mov     x0, x21                         // pass FDT address in x0
        bl      kaslr_early_init                // parse FDT for KASLR options
        cbz     x0, 0f                          // KASLR disabled? just proceed
        orr     x23, x23, x0                    // record KASLR offset
        ldp     x29, x30, [sp], #16             // we must enable KASLR, return
        ret                                     // to __primary_switch()
0:
#endif
        add     sp, sp, #16                     // drop the frame record pushed above
        mov     x29, #0                         // clear frame pointer and link register
        mov     x30, #0                         // before entering C code
        b       start_kernel
SYM_FUNC_END(__primary_switched)

After the processor has been initialized and the execution environment has been set up, the boot code jumps to the first function written in C, start_kernel.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
/*
 * start_kernel - C entry point of the boot sequence.
 *
 * Runs a long, strictly ordered chain of initialisation calls. Interrupts
 * are disabled near the top (local_irq_disable()) and enabled once, after
 * call_function_init() (local_irq_enable()). The last step hands over to
 * arch_call_rest_init().
 */
asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
{
        char *command_line;
        char *after_dashes;

        set_task_stack_end_magic(&init_task);
        smp_setup_processor_id();
        debug_objects_early_init();

        cgroup_init_early();

        local_irq_disable();
        early_boot_irqs_disabled = true;

        /*
         * Interrupts are still disabled. Do necessary setups, then
         * enable them.
         */
        boot_cpu_init();
        page_address_init();
        pr_notice("%s", linux_banner);
        early_security_init();
        /*
         * command_line is not assigned anywhere in this function, so it is
         * presumably filled in by setup_arch() through the pointer passed
         * here -- confirm against the architecture's setup_arch().
         */
        setup_arch(&command_line);
        setup_boot_config(command_line);
        setup_command_line(command_line);
        setup_nr_cpu_ids();
        setup_per_cpu_areas();
        smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
        boot_cpu_hotplug_init();

        build_all_zonelists(NULL);
        page_alloc_init();

        pr_notice("Kernel command line: %s\n", saved_command_line);
        /* parameters may set static keys */
        jump_label_init();
        parse_early_param();
        /*
         * Parse the built-in parameter table; whatever parse_args() returns
         * is fed to set_init_arg below unless it is NULL or an error pointer.
         */
        after_dashes = parse_args("Booting kernel",
                                  static_command_line, __start___param,
                                  __stop___param - __start___param,
                                  -1, -1, NULL, &unknown_bootoption);
        if (!IS_ERR_OR_NULL(after_dashes))
                parse_args("Setting init args", after_dashes, NULL, 0, -1, -1,
                           NULL, set_init_arg);
        if (extra_init_args)
                parse_args("Setting extra init args", extra_init_args,
                           NULL, 0, -1, -1, NULL, set_init_arg);

        /*
         * These use large bootmem allocations and must precede
         * kmem_cache_init()
         */
        setup_log_buf(0);
        vfs_caches_init_early();
        sort_main_extable();
        trap_init();
        mm_init();

        ftrace_init();

        /* trace_printk can be enabled here */
        early_trace_init();

        /*
         * Set up the scheduler prior starting any interrupts (such as the
         * timer interrupt). Full topology setup happens at smp_init()
         * time - but meanwhile we still have a functioning scheduler.
         */
        sched_init();
        /*
         * Disable preemption - early bootup scheduling is extremely
         * fragile until we cpu_idle() for the first time.
         */
        preempt_disable();
        if (WARN(!irqs_disabled(),
                 "Interrupts were enabled *very* early, fixing it\n"))
                local_irq_disable();
        radix_tree_init();

        /*
         * Set up housekeeping before setting up workqueues to allow the unbound
         * workqueue to take non-housekeeping into account.
         */
        housekeeping_init();

        /*
         * Allow workqueue creation and work item queueing/cancelling
         * early.  Work item execution depends on kthreads and starts after
         * workqueue_init().
         */
        workqueue_init_early();

        rcu_init();

        /* Trace events are available after this */
        trace_init();

        if (initcall_debug)
                initcall_debug_enable();

        context_tracking_init();
        /* init some links before init_ISA_irqs() */
        early_irq_init();
        init_IRQ();
        tick_init();
        rcu_init_nohz();
        init_timers();
        hrtimers_init();
        softirq_init();
        timekeeping_init();

        /*
         * For best initial stack canary entropy, prepare it after:
         * - setup_arch() for any UEFI RNG entropy and boot cmdline access
         * - timekeeping_init() for ktime entropy used in rand_initialize()
         * - rand_initialize() to get any arch-specific entropy like RDRAND
         * - add_latent_entropy() to get any latent entropy
         * - adding command line entropy
         */
        rand_initialize();
        add_latent_entropy();
        add_device_randomness(command_line, strlen(command_line));
        boot_init_stack_canary();

        time_init();
        perf_event_init();
        profile_init();
        call_function_init();
        WARN(!irqs_disabled(), "Interrupts were enabled early\n");

        /* From here on interrupts are enabled for the rest of the function. */
        early_boot_irqs_disabled = false;
        local_irq_enable();

        kmem_cache_init_late();

        /*
         * HACK ALERT! This is early. We're enabling the console before
         * we've done PCI setups etc, and console_init() must be aware of
         * this. But we do want output early, in case something goes wrong.
         */
        console_init();
        if (panic_later)
                panic("Too many boot %s vars at `%s'", panic_later,
                      panic_param);

        lockdep_init();

        /*
         * Need to run this when irqs are enabled, because it wants
         * to self-test [hard/soft]-irqs on/off lock inversion bugs
         * too:
         */
        locking_selftest();

        /*
         * This needs to be called before any devices perform DMA
         * operations that might use the SWIOTLB bounce buffers. It will
         * mark the bounce buffers as decrypted so that their usage will
         * not cause "plain-text" data to be decrypted when accessed.
         */
        mem_encrypt_init();

#ifdef CONFIG_BLK_DEV_INITRD
        /*
         * Drop the initrd (initrd_start = 0) when its first page frame lies
         * below min_low_pfn and initrd_below_start_ok is not set.
         */
        if (initrd_start && !initrd_below_start_ok &&
            page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) {
                pr_crit("initrd overwritten (0x%08lx < 0x%08lx) - disabling it.\n",
                    page_to_pfn(virt_to_page((void *)initrd_start)),
                    min_low_pfn);
                initrd_start = 0;
        }
#endif
        setup_per_cpu_pageset();
        numa_policy_init();
        acpi_early_init();
        /* late_time_init is an optional hook; call it only when set. */
        if (late_time_init)
                late_time_init();
        sched_clock_init();
        calibrate_delay();
        pid_idr_init();
        anon_vma_init();
#ifdef CONFIG_X86
        if (efi_enabled(EFI_RUNTIME_SERVICES))
                efi_enter_virtual_mode();
#endif
        thread_stack_cache_init();
        cred_init();
        fork_init();
        proc_caches_init();
        uts_ns_init();
        key_init();
        security_init();
        dbg_late_init();
        vfs_caches_init();
        pagecache_init();
        signals_init();
        seq_file_init();
        proc_root_init();
        nsfs_init();
        cpuset_init();
        cgroup_init();
        taskstats_init_early();
        delayacct_init();

        poking_init();
        check_bugs();

        acpi_subsystem_init();
        arch_post_acpi_subsys_init();
        sfi_init_late();
        kcsan_init();

        /* Do the rest non-__init'ed, we're now alive */
        arch_call_rest_init();

        prevent_tail_call_optimization();
}

Although start_kernel calls many initialization functions, the one we are interested in here is the last one, arch_call_rest_init, which invokes the rest_init function.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
/*
 * arch_call_rest_init - default hook that simply calls rest_init().
 *
 * Declared __weak, so another definition of the same symbol may replace it
 * at link time.
 */
void __init __weak arch_call_rest_init(void)
{
        rest_init();
}

/*
 * We need to finalize in a non-__init function or else race conditions
 * between the root thread and the init thread may cause start_kernel to
 * be reaped by free_initmem before the root thread has proceeded to
 * cpu_idle.
 *
 * gcc-3.4 accidentally inlines this function, so use noinline.
 */

/*
 * Completed by rest_init() once kthreadd has been spawned;
 * kernel_init_freeable() waits on it before doing anything else.
 */
static __initdata DECLARE_COMPLETION(kthreadd_done);

/*
 * rest_init - spawn the kernel_init and kthreadd threads, then go idle.
 *
 * Order of events: create kernel_init, restrict it to the current CPU,
 * create kthreadd and record its task in kthreadd_task, switch
 * system_state to SYSTEM_SCHEDULING, signal kthreadd_done, call the
 * scheduler once, and finally enter cpu_startup_entry().
 *
 * NOTE(review): the pid returned by kernel_thread() is not checked for
 * failure before being passed to find_task_by_pid_ns().
 */
noinline void __ref rest_init(void)
{
        struct task_struct *tsk;
        int pid;

        rcu_scheduler_starting();
        /*
         * We need to spawn init first so that it obtains pid 1, however
         * the init task will end up wanting to create kthreads, which, if
         * we schedule it before we create kthreadd, will OOPS.
         */
        pid = kernel_thread(kernel_init, NULL, CLONE_FS);
        /*
         * Pin init on the boot CPU. Task migration is not properly working
         * until sched_init_smp() has been run. It will set the allowed
         * CPUs for init to the non isolated CPUs.
         */
        rcu_read_lock();
        tsk = find_task_by_pid_ns(pid, &init_pid_ns);
        set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id()));
        rcu_read_unlock();

        numa_default_policy();
        pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
        /* Look up the task for the new pid under the RCU read lock. */
        rcu_read_lock();
        kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
        rcu_read_unlock();

        /*
         * Enable might_sleep() and smp_processor_id() checks.
         * They cannot be enabled earlier because with CONFIG_PREEMPTION=y
         * kernel_thread() would trigger might_sleep() splats. With
         * CONFIG_PREEMPT_VOLUNTARY=y the init task might have scheduled
         * already, but it's stuck on the kthreadd_done completion.
         */
        system_state = SYSTEM_SCHEDULING;

        /* Releases kernel_init_freeable(), which waits on this completion. */
        complete(&kthreadd_done);

        /*
         * The boot idle thread must execute schedule()
         * at least once to get things moving:
         */
        schedule_preempt_disabled();
        /* Call into cpu_idle with preempt disabled */
        cpu_startup_entry(CPUHP_ONLINE);
}

The most important job of the rest_init function is spawning the kernel_init thread as the first kernel thread. The spawned thread starts out in the kernel and is dedicated to launching the first process, which is usually named */init (for example, /sbin/init).

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
/*
 * kernel_init - thread function of the first thread created by rest_init().
 *
 * Finishes the freeable part of boot, releases init memory, moves the
 * system to SYSTEM_RUNNING and then tries to start an init program.
 * Candidates are tried in this order: ramdisk_execute_command,
 * execute_command, CONFIG_DEFAULT_INIT, then a fixed list of fallback paths.
 *
 * Returns 0 as soon as one candidate has been started. A failing
 * execute_command, or running out of candidates, ends in panic().
 */
static int __ref kernel_init(void *unused)
{
        int err;

        kernel_init_freeable();

        /* Async __init code must have completed before its memory goes away. */
        async_synchronize_full();
        kprobe_free_init_mem();
        ftrace_free_init_mem();
        free_initmem();
        mark_readonly();

        /*
         * The kernel mappings are final at this point, so the userspace
         * page-table can be updated to finalize PTI.
         */
        pti_finalize();

        system_state = SYSTEM_RUNNING;
        numa_default_policy();

        rcu_end_inkernel_boot();

        do_sysctl_args();

        /* 1) Ramdisk init: a failure is only logged, then we fall through. */
        if (ramdisk_execute_command) {
                err = run_init_process(ramdisk_execute_command);
                if (err == 0)
                        return 0;
                pr_err("Failed to execute %s (error %d)\n",
                       ramdisk_execute_command, err);
        }

        /*
         * 2) Explicitly requested init: a failure here is fatal.
         *
         * Each candidate from here on is tried until one succeeds. The
         * Bourne shell at the end of the list makes it possible to recover
         * a really broken machine.
         */
        if (execute_command) {
                err = run_init_process(execute_command);
                if (err == 0)
                        return 0;
                panic("Requested init %s failed (error %d).",
                      execute_command, err);
        }

        /* 3) Build-time default, skipped when it is the empty string. */
        if (CONFIG_DEFAULT_INIT[0] != '\0') {
                err = run_init_process(CONFIG_DEFAULT_INIT);
                if (err == 0)
                        return 0;
                pr_err("Default init %s failed (error %d)\n",
                       CONFIG_DEFAULT_INIT, err);
        }

        /* 4) Well-known locations, in order; stop at the first success. */
        if (try_to_run_init_process("/sbin/init") == 0)
                return 0;
        if (try_to_run_init_process("/etc/init") == 0)
                return 0;
        if (try_to_run_init_process("/bin/init") == 0)
                return 0;
        if (try_to_run_init_process("/bin/sh") == 0)
                return 0;

        panic("No working init found.  Try passing init= option to kernel. "
              "See Linux Documentation/admin-guide/init.rst for guidance.");
}

Although the main goal of the kernel_init thread is to execute the init process, we will look only at the parts that handle registration of the devices specified in the device tree. Before the kernel_init thread actually invokes the init process, it calls the kernel_init_freeable function, which is where device registration is actually handled.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
/*
 * kernel_init_freeable - the __init part of the kernel_init thread.
 *
 * Blocks until rest_init() has signalled kthreadd_done, then brings up the
 * remaining core services in order (SMP, workqueues, page allocator late
 * init, ...) and runs do_basic_setup(). Finally it checks whether the
 * ramdisk init program is accessible and, if not, clears
 * ramdisk_execute_command and calls prepare_namespace().
 */
static noinline void __init kernel_init_freeable(void)
{
        /*
         * Wait until kthreadd is all set-up.
         */
        wait_for_completion(&kthreadd_done);

        /* Now the scheduler is fully set up and can do blocking allocations */
        gfp_allowed_mask = __GFP_BITS_MASK;

        /*
         * init can allocate pages on any node
         */
        set_mems_allowed(node_states[N_MEMORY]);

        cad_pid = task_pid(current);

        smp_prepare_cpus(setup_max_cpus);

        workqueue_init();

        init_mm_internals();

        rcu_init_tasks_generic();
        do_pre_smp_initcalls();
        lockup_detector_init();

        smp_init();
        sched_init_smp();

        padata_init();
        page_alloc_init_late();
        /* Initialize page ext after all struct pages are initialized. */
        page_ext_init();

        /* Driver model initialisation and the initcalls run from here. */
        do_basic_setup();

        kunit_run_all_tests();

        console_on_rootfs();

        /*
         * check if there is an early userspace init.  If yes, let it do all
         * the work
         */
        if (init_eaccess(ramdisk_execute_command) != 0) {
                ramdisk_execute_command = NULL;
                prepare_namespace();
        }

        /*
         * Ok, we have completed the initial bootup, and
         * we're essentially up and running. Get rid of the
         * initmem segments and start the user-mode stuff..
         *
         * rootfs is available now, try loading the public keys
         * and default modules
         */


        integrity_load_keys();
}

/*
 * Ok, the machine is now initialized. None of the devices
 * have been touched yet, but the CPU subsystem is up and
 * running, and memory and process management works.
 *
 * Now we can finally start doing some real work..
 *
 * Called from kernel_init_freeable(). driver_init() runs before
 * do_initcalls(), so the driver model core is set up before any initcall
 * executes.
 */
static void __init do_basic_setup(void)
{
        cpuset_init_smp();
        driver_init();
        init_irq_proc();
        do_ctors();
        usermodehelper_enable();
        do_initcalls();
}

Among the initializations done by do_basic_setup, driver_init and do_initcalls are the ones most closely related to platform_device and device tree parsing. Here, device tree parsing means registering the devices specified in the device tree with the kernel driver subsystem.

Initialization of driver subsystem

Before we can register devices and bind their associated drivers, we have to initialize the driver subsystem of the kernel.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
/**
 * driver_init - initialize driver model.
 *
 * Call the driver model init functions to initialize their
 * subsystems. Called early from init/main.c.
 *
 * The return values of the individual init functions are not checked here.
 */
void __init driver_init(void)
{
        /* These are the core pieces */
        devtmpfs_init();
        /* Creates devices_kset, which buses_init() dereferences below. */
        devices_init();
        buses_init();
        classes_init();
        /* Creates firmware_kobj, which of_core_init() uses as a parent below. */
        firmware_init();
        hypervisor_init();

        /* These are also core pieces, but must come after the
         * core core pieces.
         */
        of_core_init();
        platform_bus_init();
        cpu_dev_init();
        memory_dev_init();
        container_dev_init();
}

/*
 * devices_init - create the kset and kobjects the device core hangs off.
 *
 * Creates, in order: the "devices" kset, the parentless "dev" kobject, and
 * its two children "block" and "char". If any step fails, everything
 * created so far is released again in reverse order.
 *
 * Returns 0 on success or -ENOMEM on any failure.
 */
int __init devices_init(void)
{
        devices_kset = kset_create_and_add("devices", &device_uevent_ops, NULL);
        if (!devices_kset)
                return -ENOMEM;

        dev_kobj = kobject_create_and_add("dev", NULL);
        if (!dev_kobj)
                goto err_unregister_devices_kset;

        sysfs_dev_block_kobj = kobject_create_and_add("block", dev_kobj);
        if (!sysfs_dev_block_kobj)
                goto err_put_dev_kobj;

        sysfs_dev_char_kobj = kobject_create_and_add("char", dev_kobj);
        if (sysfs_dev_char_kobj)
                return 0;

        /* "char" could not be created: unwind from the newest object down. */
        kobject_put(sysfs_dev_block_kobj);
 err_put_dev_kobj:
        kobject_put(dev_kobj);
 err_unregister_devices_kset:
        kset_unregister(devices_kset);
        return -ENOMEM;
}

/*
 * buses_init - create the "bus" kset and the "system" kset.
 *
 * "bus" is created without a parent; "system" is created underneath the
 * kobject of devices_kset, so devices_kset must already exist when this
 * runs.
 *
 * If "system" cannot be created, the already registered "bus" kset is
 * unregistered again and bus_kset is reset to NULL. This matches the unwind
 * style of devices_init(): no half-initialised kset is leaked, and no stale
 * pointer stays behind.
 *
 * Returns 0 on success or -ENOMEM on failure.
 */
int __init buses_init(void)
{
        bus_kset = kset_create_and_add("bus", &bus_uevent_ops, NULL);
        if (!bus_kset)
                return -ENOMEM;

        system_kset = kset_create_and_add("system", NULL, &devices_kset->kobj);
        if (!system_kset) {
                /* Undo the first step so the failure leaves nothing behind. */
                kset_unregister(bus_kset);
                bus_kset = NULL;
                return -ENOMEM;
        }

        return 0;
}
/* Parentless "firmware" kobject; set up by firmware_init(). */
struct kobject *firmware_kobj;
EXPORT_SYMBOL_GPL(firmware_kobj);

/*
 * firmware_init - create the top-level "firmware" kobject.
 *
 * Returns 0 on success, or -ENOMEM when the kobject could not be created.
 */
int __init firmware_init(void)
{
        firmware_kobj = kobject_create_and_add("firmware", NULL);

        return firmware_kobj ? 0 : -ENOMEM;
}

Most of the init functions initialize driver-related structures used to manage devices, drivers, buses, and so on, which represent the resources that can be registered with the driver subsystem. These resources are managed with ksets and kobjects. For example, the devices_init function allocates the kset that manages every device registered in the system. It also creates a root kobject, dev, and the block and char kobjects are managed as its children. The other init functions are mostly the same: they allocate the ksets and kobjects associated with the specific resources used in the driver subsystem.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
/*
 * of_core_init - expose the device tree through a "devicetree" kset.
 *
 * Under of_mutex: creates the kset beneath firmware_kobj, then walks every
 * device node, attaches it to sysfs and, for nodes with a non-zero phandle,
 * stores the node in its phandle_cache slot unless that slot is already
 * taken. Afterwards, if a root node exists, a "device-tree" symlink is
 * created in /proc.
 *
 * Failure to create the kset is logged and ends the function early.
 */
void __init of_core_init(void)
{
        struct device_node *np;

        /* Create the kset, and register existing nodes */
        mutex_lock(&of_mutex);

        of_kset = kset_create_and_add("devicetree", NULL, firmware_kobj);
        if (!of_kset) {
                mutex_unlock(&of_mutex);
                pr_err("failed to register existing nodes\n");
                return;
        }

        for_each_of_allnodes(np) {
                struct device_node **slot;

                __of_attach_node_sysfs(np);

                /* Nodes without a phandle never enter the cache. */
                if (!np->phandle)
                        continue;

                /* First node to claim a slot keeps it. */
                slot = &phandle_cache[of_phandle_cache_hash(np->phandle)];
                if (!*slot)
                        *slot = np;
        }

        mutex_unlock(&of_mutex);

        /* Symlink in /proc as required by userspace ABI */
        if (of_root)
                proc_symlink("device-tree", NULL, "/sys/firmware/devicetree/base");
}

Similar to the other init functions, the of_core_init function also creates a kset, this time for the device tree. In addition, it traverses every device_node of the device tree, attaches it to sysfs, and stores each node that has a phandle in phandle_cache at the slot given by the hash of its phandle.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
/*
 * Bus type describing the "platform" bus; registered by
 * platform_bus_init() through bus_register(). Each callback below is a
 * platform-specific implementation defined elsewhere.
 */
struct bus_type platform_bus_type = {
        .name           = "platform",
        .dev_groups     = platform_dev_groups,
        .match          = platform_match,
        .uevent         = platform_uevent,
        .probe          = platform_probe,
        .remove         = platform_remove,
        .shutdown       = platform_shutdown,
        .dma_configure  = platform_dma_configure,
        .pm             = &platform_dev_pm_ops,
};
EXPORT_SYMBOL_GPL(platform_bus_type);

/*
 * platform_bus_init - register the platform bus device and bus type.
 *
 * Registers the platform_bus device first and the platform_bus_type bus
 * second. A failed device registration drops the reference with
 * put_device() and returns immediately. A failed bus registration
 * unregisters the device again. The OF reconfig notifier is registered in
 * both of the latter cases.
 *
 * Returns 0 on success or the error code of the step that failed.
 */
int __init platform_bus_init(void)
{
        int ret;

        early_platform_cleanup();

        ret = device_register(&platform_bus);
        if (ret) {
                /* Give up the reference instead of freeing the device. */
                put_device(&platform_bus);
                return ret;
        }

        ret = bus_register(&platform_bus_type);
        if (ret)
                device_unregister(&platform_bus);

        /* Reached whether or not bus_register() succeeded. */
        of_platform_register_reconfig_notifier();

        return ret;
}

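Now we finally reach some more complex routines: platform_bus_init first registers the platform_bus device with device_register and then registers the platform bus type with bus_register.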

Registering bus device

drivers/base/core.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
/**
 * device_register - register a device with the system.
 * @dev: pointer to the device structure
 *
 * This happens in two clean steps - initialize the device
 * and add it to the system. The two steps can be called
 * separately, but this is the easiest and most common.
 * I.e. you should only call the two helpers separately if
 * you have a clearly defined need to use and refcount the device
 * before it is added to the hierarchy.
 *
 * For more information, see the kerneldoc for device_initialize()
 * and device_add().
 *
 * NOTE: _Never_ directly free @dev after calling this function, even
 * if it returned an error! Always use put_device() to give up the
 * reference initialized in this function instead.
 *
 * Return: whatever device_add() returns for @dev.
 */
int device_register(struct device *dev)
{
        device_initialize(dev);
        return device_add(dev);
}
EXPORT_SYMBOL_GPL(device_register);
/**
 * device_initialize - init device structure.
 * @dev: device.
 *
 * This prepares the device for use by other layers by initializing
 * its fields.
 * It is the first half of device_register(), if called by
 * that function, though it can also be called separately, so one
 * may use @dev's fields. In particular, get_device()/put_device()
 * may be used for reference counting of @dev after calling this
 * function.
 *
 * All fields in @dev must be initialized by the caller to 0, except
 * for those explicitly set to some other value.  The simplest
 * approach is to use kzalloc() to allocate the structure containing
 * @dev.
 *
 * NOTE: Use put_device() to give up your reference instead of freeing
 * @dev directly once you have called this function.
 */
void device_initialize(struct device *dev)
{
        /* Every device's kobject is made a member of the global devices kset. */
        dev->kobj.kset = devices_kset;
        /* device_ktype supplies the kobject callbacks, e.g. device_release(). */
        kobject_init(&dev->kobj, &device_ktype);
        INIT_LIST_HEAD(&dev->dma_pools);
        mutex_init(&dev->mutex);
#ifdef CONFIG_PROVE_LOCKING
        mutex_init(&dev->lockdep_mutex);
#endif
        lockdep_set_novalidate_class(&dev->mutex);
        spin_lock_init(&dev->devres_lock);
        INIT_LIST_HEAD(&dev->devres_head);
        device_pm_init(dev);
        /* NOTE(review): -1 presumably means "no NUMA node assigned" -- confirm. */
        set_dev_node(dev, -1);
#ifdef CONFIG_GENERIC_MSI_IRQ
        INIT_LIST_HEAD(&dev->msi_list);
#endif
        /* Device-link bookkeeping starts empty, with no driver bound. */
        INIT_LIST_HEAD(&dev->links.consumers);
        INIT_LIST_HEAD(&dev->links.suppliers);
        INIT_LIST_HEAD(&dev->links.defer_sync);
        dev->links.status = DL_DEV_NO_DRIVER;
}
EXPORT_SYMBOL_GPL(device_initialize);

The first part of the device_register function is device_initialize, which initializes the device structure passed to device_register.

The most important part of device_initialize is assigning devices_kset to the kset of the device's kobject. As we will see later, kobj->kset must be assigned before the kobject_add function is called to add the device's kobject.

Also, to manage the device structure through a kobject, its ktype must be set by the kobject_init function. The second parameter, device_ktype, is a kobj_type object that contains some callback functions. The most important one is the release callback, which is invoked when the reference count of the device structure (maintained by the kobject) drops to zero.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
/*
 * kobj_type attached to every device's kobject by device_initialize().
 * Its .release callback, device_release(), runs when the kobject's
 * reference count reaches 0.
 */
static struct kobj_type device_ktype = {
        .release        = device_release,
        .sysfs_ops      = &dev_sysfs_ops,
        .namespace      = device_namespace,
        .get_ownership  = device_get_ownership,
};
/**
 * device_release - free device structure.
 * @kobj: device's kobject.
 *
 * This is called once the reference count for the object
 * reaches 0. We forward the call to the device's release
 * method, which should handle actually freeing the structure.
 *
 * The release method is looked up in this order: the device itself, its
 * type, then its class. If none of them provides one, a warning is emitted.
 */
static void device_release(struct kobject *kobj)
{
        struct device *dev = kobj_to_dev(kobj);
        /*
         * Read dev->p up front: it is freed at the very end, after the
         * release callback has run. NOTE(review): presumably because the
         * callback may free @dev itself -- confirm.
         */
        struct device_private *p = dev->p;

        /*
         * Some platform devices are driven without driver attached
         * and managed resources may have been acquired.  Make sure
         * all resources are released.
         *
         * Drivers still can add resources into device after device
         * is deleted but alive, so release devres here to avoid
         * possible memory leak.
         */
        devres_release_all(dev);

        kfree(dev->dma_range_map);

        /* First matching release method wins: device, then type, then class. */
        if (dev->release)
                dev->release(dev);
        else if (dev->type && dev->type->release)
                dev->type->release(dev);
        else if (dev->class && dev->class->dev_release)
                dev->class->dev_release(dev);
        else
                WARN(1, KERN_ERR "Device '%s' does not have a release() function, it is broken and must be fixed. See Documentation/core-api/kobject.rst.\n",
                        dev_name(dev));
        kfree(p);
}

When the device_release function is invoked, it first frees all managed resources allocated for the device by calling devres_release_all(dev). After that, it invokes the release function registered in the device structure (or, failing that, in its type or class) to release the device structure, which is no longer in use.

Now, let’s take a look at the rest of device_initialize. The device argument passed to the function is the platform_bus variable in this case.

1
2
3
4
/*
 * Statically allocated device passed to device_register() by
 * platform_bus_init(). Only the initial name is set here; every other
 * field starts out zeroed.
 */
struct device platform_bus = {
        .init_name      = "platform",
};
EXPORT_SYMBOL_GPL(platform_bus);

Although this device structure has only a single initialized member field, the device structure contains a bunch of other fields used to abstract a physical/virtual device in the driver sub-system.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
/**
 * struct device - The basic device structure
 * @parent:     The device's "parent" device, the device to which it is attached.
 *              In most cases, a parent device is some sort of bus or host
 *              controller. If parent is NULL, the device, is a top-level device,
 *              which is not usually what you want.
 * @p:          Holds the private data of the driver core portions of the device.
 *              See the comment of the struct device_private for detail.
 * @kobj:       A top-level, abstract class from which other classes are derived.
 * @init_name:  Initial name of the device.
 * @type:       The type of device.
 *              This identifies the device type and carries type-specific
 *              information.
 * @mutex:      Mutex to synchronize calls to its driver.
 * @lockdep_mutex: An optional debug lock that a subsystem can use as a
 *              peer lock to gain localized lockdep coverage of the device_lock.
 * @bus:        Type of bus device is on.
 * @driver:     Which driver has allocated this
 * @platform_data: Platform data specific to the device.
 *              Example: For devices on custom boards, as typical of embedded
 *              and SOC based hardware, Linux often uses platform_data to point
 *              to board-specific structures describing devices and how they
 *              are wired.  That can include what ports are available, chip
 *              variants, which GPIO pins act in what additional roles, and so
 *              on.  This shrinks the "Board Support Packages" (BSPs) and
 *              minimizes board-specific #ifdefs in drivers.
 * @driver_data: Private pointer for driver specific info.
 * @links:      Links to suppliers and consumers of this device.
 * @power:      For device power management.
 *              See Documentation/driver-api/pm/devices.rst for details.
 * @pm_domain:  Provide callbacks that are executed during system suspend,
 *              hibernation, system resume and during runtime PM transitions
 *              along with subsystem-level and driver-level callbacks.
 * @em_pd:      device's energy model performance domain
 * @pins:       For device pin management.
 *              See Documentation/driver-api/pinctl.rst for details.
 * @msi_list:   Hosts MSI descriptors
 * @msi_domain: The generic MSI domain this device is using.
 * @numa_node:  NUMA node this device is close to.
 * @dma_ops:    DMA mapping operations for this device.
 * @dma_mask:   Dma mask (if dma'ble device).
 * @coherent_dma_mask: Like dma_mask, but for alloc_coherent mapping as not all
 *              hardware supports 64-bit addresses for consistent allocations
 *              such descriptors.
 * @bus_dma_limit: Limit of an upstream bridge or bus which imposes a smaller
 *              DMA limit than the device itself supports.
 * @dma_range_map: map for DMA memory ranges relative to that of RAM
 * @dma_parms:  A low level driver may set these to teach IOMMU code about
 *              segment limitations.
 * @dma_pools:  Dma pools (if dma'ble device).
 * @dma_mem:    Internal for coherent mem override.
 * @cma_area:   Contiguous memory area for dma allocations
 * @archdata:   For arch-specific additions.
 * @of_node:    Associated device tree node.
 * @fwnode:     Associated device node supplied by platform firmware.
 * @devt:       For creating the sysfs "dev".
 * @id:         device instance
 * @devres_lock: Spinlock to protect the resource of the device.
 * @devres_head: The resources list of the device.
 * @knode_class: The node used to add the device to the class list.
 * @class:      The class of the device.
 * @groups:     Optional attribute groups.
 * @release:    Callback to free the device after all references have
 *              gone away. This should be set by the allocator of the
 *              device (i.e. the bus driver that discovered the device).
 * @iommu_group: IOMMU group the device belongs to.
 * @iommu:      Per device generic IOMMU runtime data
 *
 * @offline_disabled: If set, the device is permanently online.
 * @offline:    Set after successful invocation of bus type's .offline().
 * @of_node_reused: Set if the device-tree node is shared with an ancestor
 *              device.
 * @state_synced: The hardware state of this device has been synced to match
 *                the software state of this device by calling the driver/bus
 *                sync_state() callback.
 * @dma_coherent: this particular device is dma coherent, even if the
 *              architecture supports non-coherent devices.
 * @dma_ops_bypass: If set to %true then the dma_ops are bypassed for the
 *              streaming DMA operations (->map_* / ->unmap_* / ->sync_*),
 *              and optionall (if the coherent mask is large enough) also
 *              for dma allocations.  This flag is managed by the dma ops
 *              instance from ->dma_supported.
 *
 * At the lowest level, every device in a Linux system is represented by an
 * instance of struct device. The device structure contains the information
 * that the device model core needs to model the system. Most subsystems,
 * however, track additional information about the devices they host. As a
 * result, it is rare for devices to be represented by bare device structures;
 * instead, that structure, like kobject structures, is usually embedded within
 * a higher-level representation of the device.
 */
struct device {
        struct kobject kobj;
        struct device           *parent;

        struct device_private   *p;

        const char              *init_name; /* initial name of the device */
        const struct device_type *type;

        struct bus_type *bus;           /* type of bus device is on */
        struct device_driver *driver;   /* which driver has allocated this
                                           device */
        void            *platform_data; /* Platform specific data, device
                                           core doesn't touch it */
        void            *driver_data;   /* Driver data, set and get with
                                           dev_set_drvdata/dev_get_drvdata */
#ifdef CONFIG_PROVE_LOCKING
        struct mutex            lockdep_mutex;
#endif
        struct mutex            mutex;  /* mutex to synchronize calls to
                                         * its driver.
                                         */

        struct dev_links_info   links;
        struct dev_pm_info      power;
        struct dev_pm_domain    *pm_domain

#ifdef CONFIG_ENERGY_MODEL
        struct em_perf_domain   *em_pd;
#endif

#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
        struct irq_domain       *msi_domain;
#endif
#ifdef CONFIG_PINCTRL
        struct dev_pin_info     *pins;
#endif
#ifdef CONFIG_GENERIC_MSI_IRQ
        struct list_head        msi_list;
#endif
#ifdef CONFIG_DMA_OPS
        const struct dma_map_ops *dma_ops;
#endif
        u64             *dma_mask;      /* dma mask (if dma'able device) */
        u64             coherent_dma_mask;/* Like dma_mask, but for
                                             alloc_coherent mappings as
                                             not all hardware supports
                                             64 bit addresses for consistent
                                             allocations such descriptors. */
        u64             bus_dma_limit;  /* upstream dma constraint */
        const struct bus_dma_region *dma_range_map;

        struct device_dma_parameters *dma_parms;

        struct list_head        dma_pools;      /* dma pools (if dma'ble) */

#ifdef CONFIG_DMA_DECLARE_COHERENT
        struct dma_coherent_mem *dma_mem; /* internal for coherent mem
                                             override */
#endif
#ifdef CONFIG_DMA_CMA
        struct cma *cma_area;           /* contiguous memory area for dma
                                           allocations */
#endif
        /* arch specific additions */
        struct dev_archdata     archdata;

        struct device_node      *of_node; /* associated device tree node */
        struct fwnode_handle    *fwnode; /* firmware device node */

#ifdef CONFIG_NUMA
        int             numa_node;      /* NUMA node this device is close to */
#endif
        dev_t                   devt;   /* dev_t, creates the sysfs "dev" */
        u32                     id;     /* device instance */

        spinlock_t              devres_lock;
        struct list_head        devres_head;

        struct class            *class;
        const struct attribute_group **groups;  /* optional groups */

        void    (*release)(struct device *dev);
        struct iommu_group      *iommu_group;
        struct dev_iommu        *iommu;

        bool                    offline_disabled:1;
        bool                    offline:1;
        bool                    of_node_reused:1;
        bool                    state_synced:1;
#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
        bool                    dma_coherent:1;
#endif
#ifdef CONFIG_DMA_OPS_BYPASS
        bool                    dma_ops_bypass : 1;
#endif
};

The device_initialize function initializes the device links that manage the consumers and suppliers of this device, the devres list that manages all resources used by the device, the DMA pool list, the mutex, etc. Lastly, it sets the device's link status to DL_DEV_NO_DRIVER, which means the device has no driver attached to it.

1
2
3
4
5
6
7
8
9
10
11
12
13
/**
 * enum dl_dev_state - Tracks whether a driver is present on a device.
 * @DL_DEV_NO_DRIVER: No driver is attached to the device.
 * @DL_DEV_PROBING: A driver is currently probing the device.
 * @DL_DEV_DRIVER_BOUND: A driver has been bound to the device.
 * @DL_DEV_UNBINDING: The driver is being unbound from the device.
 */
enum dl_dev_state {
        DL_DEV_NO_DRIVER        = 0,
        DL_DEV_PROBING          = 1,
        DL_DEV_DRIVER_BOUND     = 2,
        DL_DEV_UNBINDING        = 3,
};

Add platform_bus device to the driver subsystem

After initializing the kobject and some fields of the platform_bus device, the device_add function registers the device with the driver sub-system.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
/**
 * device_add - add device to device hierarchy.
 * @dev: device.
 *
 * This is part 2 of device_register(), though may be called
 * separately _iff_ device_initialize() has been called separately.
 *
 * This adds @dev to the kobject hierarchy via kobject_add(), adds it
 * to the global and sibling lists for the device, then
 * adds it to the other relevant subsystems of the driver model.
 *
 * Do not call this routine or device_register() more than once for
 * any device structure.  The driver model core is not designed to work
 * with devices that get unregistered and then spring back to life.
 * (Among other things, it's very hard to guarantee that all references
 * to the previous incarnation of @dev have been dropped.)  Allocate
 * and register a fresh new struct device instead.
 *
 * NOTE: _Never_ directly free @dev after calling this function, even
 * if it returned an error! Always use put_device() to give up your
 * reference instead.
 *
 * Rule of thumb is: if device_add() succeeds, you should call
 * device_del() when you want to get rid of it. If device_add() has
 * *not* succeeded, use *only* put_device() to drop the reference
 * count.
 */
int device_add(struct device *dev)
{
        struct device *parent;
        struct kobject *kobj;
        struct class_interface *class_intf;
        int error = -EINVAL;    /* returned as-is when @dev is NULL */
        struct kobject *glue_dir = NULL;

        /* Hold a reference for the duration of this call; dropped at done:. */
        dev = get_device(dev);
        if (!dev)
                goto done;

        /* Set up the driver-core private data if it does not exist yet. */
        if (!dev->p) {
                error = device_private_init(dev);
                if (error)
                        goto done;
        }

        /*
         * for statically allocated devices, which should all be converted
         * some day, we need to initialize the name. We prevent reading back
         * the name, and force the use of dev_name()
         */
        if (dev->init_name) {
                dev_set_name(dev, "%s", dev->init_name);
                dev->init_name = NULL;
        }

        /* subsystems can specify simple device enumeration */
        if (!dev_name(dev) && dev->bus && dev->bus->dev_name)
                dev_set_name(dev, "%s%u", dev->bus->dev_name, dev->id);

        /* A device without a name cannot be added. */
        if (!dev_name(dev)) {
                error = -EINVAL;
                goto name_error;
        }

        pr_debug("device: '%s': %s\n", dev_name(dev), __func__);

        /* Pin the parent (may be NULL) and resolve the parent kobject. */
        parent = get_device(dev->parent);
        kobj = get_device_parent(dev, parent);
        if (IS_ERR(kobj)) {
                error = PTR_ERR(kobj);
                goto parent_error;
        }
        if (kobj)
                dev->kobj.parent = kobj;

        /* use parent numa_node */
        if (parent && (dev_to_node(dev) == NUMA_NO_NODE))
                set_dev_node(dev, dev_to_node(parent));

        /* first, register with generic layer. */
        /* we require the name to be set before, and pass NULL */
        error = kobject_add(&dev->kobj, dev->kobj.parent, NULL);
        if (error) {
                glue_dir = get_glue_dir(dev);
                goto Error;
        }

        /* notify platform of device entry */
        error = device_platform_notify(dev, KOBJ_ADD);
        if (error)
                goto platform_error;

        error = device_create_file(dev, &dev_attr_uevent);
        if (error)
                goto attrError;

        error = device_add_class_symlinks(dev);
        if (error)
                goto SymlinkError;
        error = device_add_attrs(dev);
        if (error)
                goto AttrsError;
        error = bus_add_device(dev);
        if (error)
                goto BusError;
        error = dpm_sysfs_add(dev);
        if (error)
                goto DPMError;
        device_pm_add(dev);

        /* Only devices with a non-zero major number get "dev" entries. */
        if (MAJOR(dev->devt)) {
                error = device_create_file(dev, &dev_attr_dev);
                if (error)
                        goto DevAttrError;

                error = device_create_sys_dev_entry(dev);
                if (error)
                        goto SysEntryError;

                devtmpfs_create_node(dev);
        }

        /* Notify clients of device addition.  This call must come
         * after dpm_sysfs_add() and before kobject_uevent().
         */
        if (dev->bus)
                blocking_notifier_call_chain(&dev->bus->p->bus_notifier,
                                             BUS_NOTIFY_ADD_DEVICE, dev);

        kobject_uevent(&dev->kobj, KOBJ_ADD);

        /*
         * Check if any of the other devices (consumers) have been waiting for
         * this device (supplier) to be added so that they can create a device
         * link to it.
         *
         * This needs to happen after device_pm_add() because device_link_add()
         * requires the supplier be registered before it's called.
         *
         * But this also needs to happen before bus_probe_device() to make sure
         * waiting consumers can link to it before the driver is bound to the
         * device and the driver sync_state callback is called for this device.
         */
        if (dev->fwnode && !dev->fwnode->dev) {
                dev->fwnode->dev = dev;
                fw_devlink_link_device(dev);
        }

        bus_probe_device(dev);
        /* Link the device into its parent's list of children. */
        if (parent)
                klist_add_tail(&dev->p->knode_parent,
                               &parent->p->klist_children);

        if (dev->class) {
                mutex_lock(&dev->class->p->mutex);
                /* tie the class to the device */
                klist_add_tail(&dev->p->knode_class,
                               &dev->class->p->klist_devices);

                /* notify any interfaces that the device is here */
                list_for_each_entry(class_intf,
                                    &dev->class->p->interfaces, node)
                        if (class_intf->add_dev)
                                class_intf->add_dev(dev, class_intf);
                mutex_unlock(&dev->class->p->mutex);
        }
done:
        /* Common exit for success and failure: drop the reference taken on entry. */
        put_device(dev);
        return error;
        /*
         * Error unwinding. Each label undoes the steps that had succeeded
         * before the failing one and falls through to the labels below it,
         * so the teardown runs in the reverse order of the setup above.
         */
 SysEntryError:
        if (MAJOR(dev->devt))
                device_remove_file(dev, &dev_attr_dev);
 DevAttrError:
        device_pm_remove(dev);
        dpm_sysfs_remove(dev);
 DPMError:
        bus_remove_device(dev);
 BusError:
        device_remove_attrs(dev);
 AttrsError:
        device_remove_class_symlinks(dev);
 SymlinkError:
        device_remove_file(dev, &dev_attr_uevent);
 attrError:
        device_platform_notify(dev, KOBJ_REMOVE);
platform_error:
        kobject_uevent(&dev->kobj, KOBJ_REMOVE);
        /* Fetch the glue dir before kobject_del() so it can be cleaned up below. */
        glue_dir = get_glue_dir(dev);
        kobject_del(&dev->kobj);
 Error:
        cleanup_glue_dir(dev, glue_dir);
parent_error:
        put_device(parent);
name_error:
        /* Free the driver-core private data and clear the now-stale pointer. */
        kfree(dev->p);
        dev->p = NULL;
        goto done;
}
EXPORT_SYMBOL_GPL(device_add);

One thing that looks odd in device_add is that it invokes get_device(dev) to obtain a reference to the device even though we already have a pointer to it.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/**
 * get_device - take a reference on a device.
 * @dev: device to reference; may be NULL.
 *
 * Takes a reference on the kobject embedded in @dev via kobject_get() and
 * returns the device that contains it. Returns NULL when @dev is NULL.
 */
struct device *get_device(struct device *dev)
{
        struct kobject *kobj;

        if (!dev)
                return NULL;

        kobj = kobject_get(&dev->kobj);
        return kobj_to_dev(kobj);
}
EXPORT_SYMBOL_GPL(get_device);

struct kobject *kobject_get(struct kobject *kobj)
{
        if (kobj) {
                if (!kobj->state_initialized)
                        WARN(1, KERN_WARNING
                                "kobject: '%s' (%p): is not initialized, yet kobject_get() is being called.\n",
                             kobject_name(kobj), kobj);
                kref_get(&kobj->kref);
        }
        return kobj; 
}
EXPORT_SYMBOL(kobject_get);

/*
 * kobj_to_dev - map a kobject back to the struct device that embeds it,
 * using container_of() on the device's kobj member.
 */
static inline struct device *kobj_to_dev(struct kobject *kobj)
{
        struct device *dev = container_of(kobj, struct device, kobj);

        return dev;
}

This is because we now have an initialized kobject embedded in the device. When the get_device function is invoked, it returns the same device pointer, but it also increases the reference counter of the kobject embedded in the device. The kobject_get function increases the reference counter and returns the kobject. The kobj_to_dev function simply uses the container_of macro to retrieve the device structure that contains the given kobject. device_add drops this reference again with put_device before it returns.

Because the platform_bus device is statically allocated and has no parent device, its parent field is NULL and we can ignore all of the functions related to retrieving its parent device object. After that, device_add invokes the kobject_add function, which registers the platform_bus device with its kset, devices_kset.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
/**
 * kobject_add() - Name a kobject and add it to the kobject hierarchy.
 * @kobj: the kobject to add
 * @parent: parent for the kobject, or NULL.
 * @fmt: printf-style format used to name the kobject.
 *
 * When @parent is non-NULL it becomes the parent of @kobj. When @parent is
 * NULL, the parent becomes the kobject of the kset assigned to @kobj; with
 * no kset assigned either, the kobject ends up in the root of the sysfs
 * tree.
 *
 * This call does not emit an "add" uevent. The caller is expected to set
 * up the sysfs files for the object first and then call kobject_uevent()
 * with KOBJ_ADD so that userspace learns about the new kobject.
 *
 * Return: 0 on success or a negative error code. -EINVAL is returned for a
 *         NULL or uninitialized @kobj. Whether this function succeeds or
 *         fails, the object must eventually be cleaned up with
 *         kobject_put(); it must never be freed directly with kfree(),
 *         since that can leak memory.
 */

int kobject_add(struct kobject *kobj, struct kobject *parent,
                const char *fmt, ...)
{
        va_list ap;
        int ret;

        if (!kobj)
                return -EINVAL;

        if (kobj->state_initialized) {
                /* Normal path: forward the variadic name arguments. */
                va_start(ap, fmt);
                ret = kobject_add_varg(kobj, parent, fmt, ap);
                va_end(ap);

                return ret;
        }

        /* Adding an object that was never initialized is a caller bug. */
        pr_err("kobject '%s' (%p): tried to add an uninitialized object, something is seriously wrong.\n",
               kobject_name(kobj), kobj);
        dump_stack();
        return -EINVAL;
}
EXPORT_SYMBOL(kobject_add);

Here, note that the parent and fmt arguments are both NULL because the platform_bus device has no parent device, and kobj points to the kobject embedded in the platform_bus device. The second check confirms that the kobject has been initialized by testing the state_initialized field of the platform_bus device's kobject. This field was set in the first part of the device_register function, device_initialize. kobject_add then hands off to kobject_add_varg, which eventually calls kobject_add_internal, shown below.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
/*
 * kobject_add_internal - attach a named kobject to its parent and kset.
 *
 * Takes a reference on the parent (falling back to the kset's kobject when
 * there is no parent), joins the kset if one is assigned, and calls
 * create_dir(). On success state_in_sysfs is set. On failure the kset
 * membership and the parent reference are dropped again.
 *
 * Returns 0 on success, -ENOENT for a NULL @kobj, -EINVAL for a missing or
 * empty name, or the error reported by create_dir().
 */
static int kobject_add_internal(struct kobject *kobj)
{
        struct kobject *parent;
        int error;

        if (!kobj)
                return -ENOENT;

        if (!kobj->name || !kobj->name[0]) {
                WARN(1,
                     "kobject: (%p): attempted to be registered with empty name!\n",
                     kobj);
                return -EINVAL;
        }

        parent = kobject_get(kobj->parent);

        /* With a kset assigned: join it, and let it stand in as parent if needed. */
        if (kobj->kset) {
                if (!parent)
                        parent = kobject_get(&kobj->kset->kobj);
                kobj_kset_join(kobj);
                kobj->parent = parent;
        }

        pr_debug("kobject: '%s' (%p): %s: parent: '%s', set: '%s'\n",
                 kobject_name(kobj), kobj, __func__,
                 parent ? kobject_name(parent) : "<NULL>",
                 kobj->kset ? kobject_name(&kobj->kset->kobj) : "<NULL>");

        error = create_dir(kobj);
        if (!error) {
                kobj->state_in_sysfs = 1;
                return 0;
        }

        /* Failure: undo the kset join and the parent reference taken above. */
        kobj_kset_leave(kobj);
        kobject_put(parent);
        kobj->parent = NULL;

        /* be noisy on error issues */
        if (error == -EEXIST)
                pr_err("%s failed for %s with -EEXIST, don't try to register things with the same name in the same directory.\n",
                       __func__, kobject_name(kobj));
        else
                pr_err("%s failed for %s (error: %d parent: %s)\n",
                       __func__, kobject_name(kobj), error,
                       parent ? kobject_name(parent) : "'none'");

        return error;
}

It registers the kobject of platform_bus with its kset using the kobj_kset_join function. Also, because this device doesn’t have a parent, its kobject doesn’t have a parent either. Therefore, the kset’s kobject is assigned as its parent kobject.

Although the current device has no attached bus because it is the bus device itself, an end device would need to be attached to the proper bus by the bus_add_device function.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
/**
 * bus_add_device - attach a device to its bus
 * @dev: device being added
 *
 * For a device that has a bus:
 * - adds the bus's default device attribute groups,
 * - creates a link to the device under the bus's devices kset and a
 *   "subsystem" link from the device back to the bus,
 * - appends the device to the bus's list of devices.
 *
 * Returns 0 on success, including when the device has no bus, or the
 * error of the step that failed. On failure the completed steps are undone
 * and the bus reference is dropped.
 */
int bus_add_device(struct device *dev)
{
        struct bus_type *bus = bus_get(dev->bus);
        int error;

        /* Nothing to do for a device that is not on any bus. */
        if (!bus)
                return 0;

        pr_debug("bus: '%s': add device %s\n", bus->name, dev_name(dev));

        error = device_add_groups(dev, bus->dev_groups);
        if (error)
                goto out_put;

        error = sysfs_create_link(&bus->p->devices_kset->kobj,
                                  &dev->kobj, dev_name(dev));
        if (error)
                goto out_groups;

        error = sysfs_create_link(&dev->kobj,
                                  &dev->bus->p->subsys.kobj, "subsystem");
        if (error)
                goto out_subsys;

        klist_add_tail(&dev->p->knode_bus, &bus->p->klist_devices);
        return 0;

out_subsys:
        sysfs_remove_link(&bus->p->devices_kset->kobj, dev_name(dev));
out_groups:
        device_remove_groups(dev, bus->dev_groups);
out_put:
        bus_put(dev->bus);
        return error;
}

The most important part of this function is adding the device’s bus knode, knode_bus, to the bus’ klist_devices klist. This registration is done by the klist_add_tail function. After the device has been attached, a device driver that can handle the registered device should be bound to it.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/**             
 * bus_probe_device - probe drivers for a new device
 * @dev: device to probe
 *
 * - Automatically probe for a driver if the bus allows it.
 */     
void bus_probe_device(struct device *dev)
{                              
        struct bus_type *bus = dev->bus;
        struct subsys_interface *sif;
                
        if (!bus)
                return;
                               
        if (bus->p->drivers_autoprobe)
                device_initial_probe(dev);
                
        mutex_lock(&bus->p->mutex); 
        list_for_each_entry(sif, &bus->p->interfaces, node)
                if (sif->add_dev)
                        sif->add_dev(dev, sif);
        mutex_unlock(&bus->p->mutex);
}

If the current device is attached to a bus and autoprobe has been enabled for that bus, it invokes device_initial_probe, which actually searches for a matching device driver and binds it to the device. The details will be covered in this post.

Bus registration

What we have done for platform_bus so far is register platform_bus as a device with the driver sub-system. In other words, because even a bus is treated as a device and managed by the driver sub-system in the Linux kernel, we first have to create the device for the bus and register it with the system. However, because the bus device is used to manage other devices attached to it, it should also have its own private sub-system for managing the attached devices.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
/*
 * platform_bus_init - register the platform bus device and its bus type.
 *
 * Registers the platform_bus device first, then the platform_bus_type bus.
 * If the bus registration fails, the device is unregistered again. The OF
 * reconfig notifier is registered in either case once the device
 * registration has succeeded.
 *
 * Returns 0 on success or the error of the registration that failed.
 */
int __init platform_bus_init(void)
{
        int ret;

        early_platform_cleanup();

        ret = device_register(&platform_bus);
        if (ret) {
                /* Drop the reference with put_device() on failure. */
                put_device(&platform_bus);
                return ret;
        }

        ret = bus_register(&platform_bus_type);
        if (ret)
                device_unregister(&platform_bus);

        of_platform_register_reconfig_notifier();

        return ret;
}

When we revisit the platform_bus_init function, we can see that the bus_register function is called with platform_bus_type, which is an object of type bus_type. Let’s take a look at the purpose of bus_type and the implementation of platform_bus_type.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
/**
 * struct bus_type - The bus type of the device
 *
 * @name:       The name of the bus.
 * @dev_name:   Used for subsystems to enumerate devices like ("foo%u", dev->id).
 * @dev_root:   Default device to use as the parent.
 * @bus_groups: Default attributes of the bus.
 * @dev_groups: Default attributes of the devices on the bus.
 * @drv_groups: Default attributes of the device drivers on the bus.
 * @match:      Called, perhaps multiple times, whenever a new device or driver
 *              is added for this bus. It should return a positive value if the
 *              given device can be handled by the given driver and zero
 *              otherwise. It may also return error code if determining that
 *              the driver supports the device is not possible. In case of
 *              -EPROBE_DEFER it will queue the device for deferred probing.
 * @uevent:     Called when a device is added, removed, or a few other things
 *              that generate uevents to add the environment variables.
 * @probe:      Called when a new device or driver add to this bus, and callback
 *              the specific driver's probe to initial the matched device.
 * @sync_state: Called to sync device state to software state after all the
 *              state tracking consumers linked to this device (present at
 *              the time of late_initcall) have successfully bound to a
 *              driver. If the device has no consumers, this function will
 *              be called at late_initcall_sync level. If the device has
 *              consumers that are never bound to a driver, this function
 *              will never get called until they do.
 * @remove:     Called when a device removed from this bus.
 * @shutdown:   Called at shut-down time to quiesce the device.
 *
 * @online:     Called to put the device back online (after offlining it).
 * @offline:    Called to put the device offline for hot-removal. May fail.
 *
 * @suspend:    Called when a device on this bus wants to go to sleep mode.
 * @resume:     Called to bring a device on this bus out of sleep mode.
 * @num_vf:     Called to find out how many virtual functions a device on this
 *              bus supports.
 * @dma_configure:      Called to setup DMA configuration on a device on
 *                      this bus.
 * @pm:         Power management operations of this bus, callback the specific
 *              device driver's pm-ops.
 * @iommu_ops:  IOMMU specific operations for this bus, used to attach IOMMU
 *              driver implementations to a bus and allow the driver to do
 *              bus-specific setup
 * @p:          The private data of the driver core, only the driver core can
 *              touch this.
 * @lock_key:   Lock class key for use by the lock validator
 * @need_parent_lock:   When probing or removing a device on this bus, the
 *                      device core should lock the device's parent.
 *
 * A bus is a channel between the processor and one or more devices. For the
 * purposes of the device model, all devices are connected via a bus, even if
 * it is an internal, virtual, "platform" bus. Buses can plug into each other.
 * A USB controller is usually a PCI device, for example. The device model
 * represents the actual connections between buses and the devices they control.
 * A bus is represented by the bus_type structure. It contains the name, the
 * default attributes, the bus' methods, PM operations, and the driver core's
 * private data.
 */
struct bus_type {
        /* Bus name and the default attributes. */
        const char              *name;
        const char              *dev_name;
        struct device           *dev_root;
        const struct attribute_group **bus_groups;
        const struct attribute_group **dev_groups;
        const struct attribute_group **drv_groups;

        /* Bus methods; remove/shutdown run on device removal and at shut-down. */
        int (*match)(struct device *dev, struct device_driver *drv);
        int (*uevent)(struct device *dev, struct kobj_uevent_env *env);
        int (*probe)(struct device *dev);
        void (*sync_state)(struct device *dev);
        int (*remove)(struct device *dev);
        void (*shutdown)(struct device *dev);

        /* Put a device back online / take it offline for hot-removal (may fail). */
        int (*online)(struct device *dev);
        int (*offline)(struct device *dev);

        /* Called when a device on this bus enters / leaves sleep mode. */
        int (*suspend)(struct device *dev, pm_message_t state);
        int (*resume)(struct device *dev);

        /* How many virtual functions a device on this bus supports. */
        int (*num_vf)(struct device *dev);

        /* Set up DMA configuration for a device on this bus. */
        int (*dma_configure)(struct device *dev);

        /* Bus power management operations; call back the device driver's pm-ops. */
        const struct dev_pm_ops *pm;

        /* IOMMU specific operations for this bus. */
        const struct iommu_ops *iommu_ops;

        /* Driver-core private data (only the driver core may touch it). */
        struct subsys_private *p;
        /* Lock class key for use by the lock validator. */
        struct lock_class_key lock_key;

        /* Lock the device's parent when probing or removing a device. */
        bool need_parent_lock;
};

/*
 * The bus_type instance for the "platform" bus.  Only the device attribute
 * groups, the match/uevent/probe/remove/shutdown callbacks, the DMA
 * configuration hook and the PM operations are provided; every other
 * bus_type member is left zero-initialized.
 */
struct bus_type platform_bus_type = {
        .name           = "platform",
        .dev_groups     = platform_dev_groups,
        .match          = platform_match,
        .uevent         = platform_uevent,
        .probe          = platform_probe,
        .remove         = platform_remove,
        .shutdown       = platform_shutdown,
        .dma_configure  = platform_dma_configure,
        .pm             = &platform_dev_pm_ops,
};

The bus_type structure contains information about the bus itself and a bunch of callback functions required to manage the bus and the devices attached to it. It also maintains a subsys_private pointer, which is what is actually used to manage the bus and its associated devices and drivers.

Let’s take a look at how the bus_register function actually registers a new bus sub-system.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
/**
 * bus_register - register a driver-core subsystem
 * @bus: bus to register
 *
 * Once we have that, we register the bus with the kobject
 * infrastructure, then register the children subsystems it has:
 * the devices and drivers that belong to the subsystem.
 *
 * Returns 0 on success, -ENOMEM when an allocation fails, or the error
 * code of the step that failed.  On failure everything set up so far is
 * undone and @bus->p is reset to NULL.
 */
int bus_register(struct bus_type *bus)
{
        int retval;
        struct subsys_private *priv;
        struct lock_class_key *key = &bus->lock_key;

        priv = kzalloc(sizeof(struct subsys_private), GFP_KERNEL);
        if (!priv)
                return -ENOMEM;

        /* Cross-link the bus and its private data so each can reach the other. */
        priv->bus = bus;
        bus->p = priv;

        BLOCKING_INIT_NOTIFIER_HEAD(&priv->bus_notifier);

        /* The subsystem's kobject is named after the bus. */
        retval = kobject_set_name(&priv->subsys.kobj, "%s", bus->name);
        if (retval)
                goto out;

        /* Make the new subsystem a member of bus_kset before registering it. */
        priv->subsys.kobj.kset = bus_kset;
        priv->subsys.kobj.ktype = &bus_ktype;
        priv->drivers_autoprobe = 1;

        retval = kset_register(&priv->subsys);
        if (retval)
                goto out;

        retval = bus_create_file(bus, &bus_attr_uevent);
        if (retval)
                goto bus_uevent_fail;

        /* Per-bus "devices" kset, child of the subsystem's kobject. */
        priv->devices_kset = kset_create_and_add("devices", NULL,
                                                 &priv->subsys.kobj);
        if (!priv->devices_kset) {
                retval = -ENOMEM;
                goto bus_devices_fail;
        }

        /* Per-bus "drivers" kset, child of the subsystem's kobject. */
        priv->drivers_kset = kset_create_and_add("drivers", NULL,
                                                 &priv->subsys.kobj);
        if (!priv->drivers_kset) {
                retval = -ENOMEM;
                goto bus_drivers_fail;
        }

        /* Lists and lock used to track interfaces, devices and drivers. */
        INIT_LIST_HEAD(&priv->interfaces);
        __mutex_init(&priv->mutex, "subsys mutex", key);
        klist_init(&priv->klist_devices, klist_devices_get, klist_devices_put);
        klist_init(&priv->klist_drivers, NULL, NULL);

        retval = add_probe_files(bus);
        if (retval)
                goto bus_probe_files_fail;

        retval = bus_add_groups(bus, bus->bus_groups);
        if (retval)
                goto bus_groups_fail;

        pr_debug("bus: '%s': registered\n", bus->name);
        return 0;

        /* Error unwinding: each label undoes the steps that succeeded before it. */
bus_groups_fail:
        remove_probe_files(bus);
bus_probe_files_fail:
        kset_unregister(bus->p->drivers_kset);
bus_drivers_fail:
        kset_unregister(bus->p->devices_kset);
bus_devices_fail:
        bus_remove_file(bus, &bus_attr_uevent);
bus_uevent_fail:
        kset_unregister(&bus->p->subsys);
out:
        kfree(bus->p);
        bus->p = NULL;
        return retval;
}
EXPORT_SYMBOL_GPL(bus_register);

/**
 * struct subsys_private - structure to hold the private to the driver core portions of the bus_type/class structure.
 *
 * @subsys - the struct kset that defines this subsystem
 * @devices_kset - the subsystem's 'devices' directory
 * @interfaces - list of subsystem interfaces associated
 * @mutex - protect the devices, and interfaces lists.
 *
 * @drivers_kset - the list of drivers associated
 * @klist_devices - the klist to iterate over the @devices_kset
 * @klist_drivers - the klist to iterate over the @drivers_kset
 * @bus_notifier - the bus notifier list for anything that cares about things
 *                 on this bus.
 * @drivers_autoprobe - one-bit flag, set to 1 by bus_register(); presumably
 *                      enables automatic driver probing (TODO confirm)
 * @bus - pointer back to the struct bus_type that this structure is associated
 *        with.
 *
 * @glue_dirs - "glue" directory to put in-between the parent device to
 *              avoid namespace conflicts
 * @class - pointer back to the struct class that this structure is associated
 *          with.
 *
 * This structure is the one that is the actual kobject allowing struct
 * bus_type/class to be statically allocated safely.  Nothing outside of the
 * driver core should ever touch these fields.
 */
struct subsys_private {
        struct kset subsys;
        struct kset *devices_kset;
        struct list_head interfaces;
        struct mutex mutex;

        struct kset *drivers_kset;
        struct klist klist_devices;
        struct klist klist_drivers;
        struct blocking_notifier_head bus_notifier;
        unsigned int drivers_autoprobe:1;
        struct bus_type *bus;

        struct kset glue_dirs;
        struct class *class;
};

The most important role of the bus_register function is generating a subsystem for the bus, which is used to manage the devices sitting on the bus and their drivers. Therefore, to understand how the registered bus manages the devices attached to it and their associated drivers, we have to understand the subsys_private data structure, which contains the most important data fields for device and driver management.

The first thing done by the bus_register function is allocating and initializing the subsys_private object. The function links the bus_type structure and the subsys_private structure by storing a pointer to each one in a member field of the other: the bus_type object is assigned to priv->bus, and the generated subsys_private object is assigned to the bus object's member field p. Therefore, when you have either a bus_type or a subsys_private object, you can always reach the other one.

Because subsystems are also managed through the kset and kobject system, the subsystem's kobject must be properly registered to a kset. You might remember that the buses_init function generates a kset named bus_kset. Note that the kset field of the generated subsystem's kobject points to bus_kset, which means the generated subsystem will be managed by bus_kset. The kset_register function handles the actual registration process.

The next important thing is initializing the klists used to manage the devices that can sit on the current bus and their associated drivers, klist_devices and klist_drivers. Note that these klists, like the devices_kset and drivers_kset created just before them, are dedicated to the current bus subsystem.

1
2
3
4
5
/*
 * Excerpt of devices_init(): creates the global "devices" kset (no parent
 * kobject, device_uevent_ops as uevent operations) and stores it in the
 * global devices_kset.  The rest of the function is elided here.
 */
int __init devices_init(void)
{
        devices_kset = kset_create_and_add("devices", &device_uevent_ops, NULL);
        ...
}

Although the devices_init function shown above also generates a kset named “devices” and assigns it to the global devices_kset, priv->devices_kset, which is dedicated to one bus sub-system, is a totally different object. The devices_kset of the bus subsystem only manages the devices attached to that bus, whereas the global devices_kset orchestrates the devices of the whole Linux driver system.

Add notifier block to device file notifier chain

Because the device tree makes heavy use of platform devices, the notifier of the platform device, platform_of_notifier, needs to be registered to the device tree's reconfiguration notifier chain. Before we take a look at which platform-related function of the notifier block will be added to that notifier chain, let’s take a look at the API functions that help us register the platform notifier block.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
/*
 * Hook platform_of_notifier into the OF reconfiguration notifier chain.
 * A non-zero result from the registration triggers WARN_ON().
 */
void of_platform_register_reconfig_notifier(void)
{
        int err = of_reconfig_notifier_register(&platform_of_notifier);

        WARN_ON(err);
}

/* Head of the chain holding the notifier blocks registered below. */
static BLOCKING_NOTIFIER_HEAD(of_reconfig_chain);

/*
 * of_reconfig_notifier_register - add @nb to of_reconfig_chain
 * @nb: notifier block to register
 *
 * Returns whatever blocking_notifier_chain_register() returns.
 */
int of_reconfig_notifier_register(struct notifier_block *nb)
{
        int ret;

        ret = blocking_notifier_chain_register(&of_reconfig_chain, nb);
        return ret;
}

/* 
 *      Blocking notifier chain routines.  All access to the chain is
 *      synchronized by an rwsem.
 */     

/**
 *      blocking_notifier_chain_register - Add notifier to a blocking notifier chain
 *      @nh: Pointer to head of the blocking notifier chain
 *      @n: New entry in notifier chain
 *
 *      Inserts @n into the chain headed by @nh.  Outside of early boot the
 *      insertion is done with the chain's rwsem held for writing.
 *      Must be called in process context.
 *
 *      Returns the result of notifier_chain_register().
 */
int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
                struct notifier_block *n)
{
        int ret;

        if (unlikely(system_state == SYSTEM_BOOTING)) {
                /*
                 * Boot-up path: task switching is not working yet and
                 * interrupts must stay disabled, so down_write() must
                 * not be called; register without taking the rwsem.
                 */
                ret = notifier_chain_register(&nh->head, n);
        } else {
                down_write(&nh->rwsem);
                ret = notifier_chain_register(&nh->head, n);
                up_write(&nh->rwsem);
        }

        return ret;
}
EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);

of_reconfig_chain is a list that manages all notifier blocks registered for device tree reconfiguration. The blocking_notifier_chain_register function helps us register a notifier block to that list.

Let’s take a look at what function is included in the platform notifier block.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
/* Notifier block whose callback is of_platform_notify(). */
static struct notifier_block platform_of_notifier = {
        .notifier_call = of_platform_notify,
};  

/*
 * of_platform_notify - notifier callback of platform_of_notifier
 * @nb: the notifier block (unused in this function)
 * @action: reconfiguration action, decoded by of_reconfig_get_state_change()
 * @arg: pointer to a struct of_reconfig_data describing the affected node
 *
 * On OF_RECONFIG_CHANGE_ADD a platform device is created for the node when
 * its parent is flagged OF_POPULATED_BUS and the node itself is not yet
 * flagged OF_POPULATED.  On OF_RECONFIG_CHANGE_REMOVE the platform device
 * found for the node is destroyed.
 *
 * Returns NOTIFY_OK, or notifier_from_errno(-EINVAL) when device creation
 * fails.
 */
static int of_platform_notify(struct notifier_block *nb,
                                unsigned long action, void *arg)
{
        struct of_reconfig_data *rd = arg; 
        struct platform_device *pdev_parent, *pdev;
        bool children_left;
 
        switch (of_reconfig_get_state_change(action, rd)) {
        case OF_RECONFIG_CHANGE_ADD:
                /* verify that the parent is a bus */
                if (!of_node_check_flag(rd->dn->parent, OF_POPULATED_BUS))
                        return NOTIFY_OK;       /* not for us */

                /* already populated? (driver using of_populate manually) */
                if (of_node_check_flag(rd->dn, OF_POPULATED))
                        return NOTIFY_OK;

                /* pdev_parent may be NULL when no bus platform device */
                pdev_parent = of_find_device_by_node(rd->dn->parent);
                pdev = of_platform_device_create(rd->dn, NULL,
                                pdev_parent ? &pdev_parent->dev : NULL);
                /* drop the reference obtained by the lookup of the parent */
                of_dev_put(pdev_parent);

                if (pdev == NULL) {
                        pr_err("%s: failed to create for '%pOF'\n",
                                        __func__, rd->dn);
                        /* of_platform_device_create tosses the error code */
                        return notifier_from_errno(-EINVAL);
                }
                break;
        
        case OF_RECONFIG_CHANGE_REMOVE:
        
                /* already depopulated? */
                if (!of_node_check_flag(rd->dn, OF_POPULATED))
                        return NOTIFY_OK;

                /* find our device by node */
                pdev = of_find_device_by_node(rd->dn);
                if (pdev == NULL)
                        return NOTIFY_OK;       /* no? not meant for us */

                /* unregister takes one ref away */
                of_platform_device_destroy(&pdev->dev, &children_left);
        
                /* and put the reference of the find */
                of_dev_put(pdev);
                break;
        }               
                
        return NOTIFY_OK;
}       



Detect and add platform devices

So far we have explored how the platform bus is initialized as a sub-system of the Linux kernel's driver system. But how can we detect the devices that need to be registered as platform devices, as opposed to USB or PCI devices, which can be detected automatically by a bus that supports hot-plugging? Although several other parts of the kernel register devices as platform devices, the place where most platform devices are discovered and registered is the device tree. Therefore, let’s take a look at the initialization function of the device tree.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
/*
 * of_platform_default_populate_init - create platform devices from the
 * device tree at arch_initcall_sync time.
 *
 * Creates devices for the nodes matching reserved_mem_matches, populates
 * the children of "/firmware" if that node exists, then populates the rest
 * of the tree through of_platform_default_populate().
 *
 * Returns 0, or -ENODEV when of_have_populated_dt() reports no device tree.
 *
 * NOTE(review): device_links_supplier_sync_state_pause() is called here
 * without a matching resume in this function; the pairing call is
 * presumably made elsewhere -- confirm.
 */
static int __init of_platform_default_populate_init(void)
{       
        struct device_node *node;

        device_links_supplier_sync_state_pause();

        if (!of_have_populated_dt())
                return -ENODEV;
 
        /*
         * Handle certain compatibles explicitly, since we don't want to create
         * platform_devices for every node in /reserved-memory with a
         * "compatible",
         */                      
        for_each_matching_node(node, reserved_mem_matches)
                of_platform_device_create(node, NULL, NULL);
 
        node = of_find_node_by_path("/firmware");
        if (node) { 
                of_platform_populate(node, NULL, NULL, NULL);
                of_node_put(node);
        }
        
        /* Populate everything else. */ 
        of_platform_default_populate(NULL, NULL, NULL);

        return 0;
}
arch_initcall_sync(of_platform_default_populate_init);

We can see that the of_platform_default_populate_init function is registered as an initcall (through arch_initcall_sync), so it will be executed while the kernel runs its initcalls in do_initcalls. The details of do_initcalls and its related macros are going to be explained in other posts.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
/**
 * of_platform_default_populate() - Populate platform devices using the
 *                                  default bus match table
 * @root: forwarded unchanged to of_platform_populate()
 * @lookup: auxdata table, forwarded unchanged to of_platform_populate()
 * @parent: parent device, forwarded unchanged to of_platform_populate()
 *
 * Thin wrapper around of_platform_populate() that always passes
 * of_default_bus_match_table as the match table.
 *
 * Returns the value returned by of_platform_populate().
 */
int of_platform_default_populate(struct device_node *root,
                                 const struct of_dev_auxdata *lookup,
                                 struct device *parent)
{
        return of_platform_populate(root, of_default_bus_match_table, lookup,
                                    parent);
}
EXPORT_SYMBOL_GPL(of_platform_default_populate);

/*
 * Compatible strings of the default match table, which
 * of_platform_default_populate() passes to of_platform_populate().
 * The "arm,amba-bus" entry is only present with CONFIG_ARM_AMBA.
 */
const struct of_device_id of_default_bus_match_table[] = {
        { .compatible = "simple-bus", },
        { .compatible = "simple-mfd", },
        { .compatible = "isa", },
#ifdef CONFIG_ARM_AMBA
        { .compatible = "arm,amba-bus", },
#endif /* CONFIG_ARM_AMBA */
        {} /* Empty terminated list */
};

The of_platform_default_populate function is a wrapper around the of_platform_populate function that always passes of_default_bus_match_table as the second parameter. The strings contained in the match table are used to find nodes whose compatible string is the same as one of the listed compatible strings. Also, because this is the first function to traverse the device tree, it is called with NULL for its root and parent arguments (the lookup argument is NULL as well).

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
/**
 * of_platform_populate() - Populate platform_devices from device tree data
 * @root: parent of the first level to probe or NULL for the root of the tree
 * @matches: match table, NULL to use the default
 * @lookup: auxdata table for matching id and platform_data with device nodes
 * @parent: parent to hook devices from, NULL for toplevel
 *
 * Similar to of_platform_bus_probe(), this function walks the device tree
 * and creates devices from nodes.  It differs in that it follows the modern
 * convention of requiring all device nodes to have a 'compatible' property,
 * and it is suitable for creating devices which are children of the root
 * node (of_platform_bus_probe will only create children of the root which
 * are selected by the @matches argument).
 *
 * New board support should be using this function instead of
 * of_platform_bus_probe().
 *
 * Returns 0 on success, < 0 on failure.
 */
int of_platform_populate(struct device_node *root,
                        const struct of_device_id *matches,
                        const struct of_dev_auxdata *lookup,
                        struct device *parent)
{
        struct device_node *child;
        int rc = 0;

        /* Take a reference on @root, or look up "/" when none was given. */
        root = root ? of_node_get(root) : of_find_node_by_path("/");
        if (!root)
                return -EINVAL;

        pr_debug("%s()\n", __func__);
        pr_debug(" starting at: %pOF\n", root);

        device_links_supplier_sync_state_pause();
        for_each_child_of_node(root, child) {
                /* strict == true: children without "compatible" are skipped */
                rc = of_platform_bus_create(child, matches, lookup, parent, true);
                if (rc) {
                        /* leaving the iterator early: drop the child reference */
                        of_node_put(child);
                        break;
                }
        }
        device_links_supplier_sync_state_resume();

        of_node_set_flag(root, OF_POPULATED_BUS);

        /* Drop the reference taken at the top of the function. */
        of_node_put(root);
        return rc;
}
EXPORT_SYMBOL_GPL(of_platform_populate);

Because the root device_node has been passed as NULL, the function must first look up the root node itself, which is done by of_find_node_by_path("/"). Once the root node has been found, it walks the children of the root with the for_each_child_of_node macro and invokes the of_platform_bus_create function for each of them; of_platform_bus_create in turn recurses into the children of every node that matches the bus match table, which is how the deeper levels of the device tree are reached. Although it passes 5 parameters, only child, matches and strict (which is true) are meaningful here, because lookup and parent are NULL.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
/**
 * of_platform_bus_create() - Create a device for a node and its children.
 * @bus: device node of the bus to instantiate
 * @matches: match table for bus nodes
 * @lookup: auxdata table for matching id and platform_data with device nodes
 * @parent: parent for new device, or NULL for top level.
 * @strict: require compatible property
 *
 * Creates a platform_device for the provided device_node, and optionally
 * recursively create devices for all the child nodes.
 *
 * Returns 0 when the node is skipped, when no device was created, or when
 * the node does not match @matches; otherwise the result of the recursion
 * over the child nodes (the first non-zero value stops the walk).
 */
static int of_platform_bus_create(struct device_node *bus,
                                  const struct of_device_id *matches,
                                  const struct of_dev_auxdata *lookup,
                                  struct device *parent, bool strict)
{
        const struct of_dev_auxdata *auxdata;
        struct device_node *child;
        struct platform_device *dev;
        const char *bus_id = NULL;
        void *platform_data = NULL;
        int rc = 0;

        /* Make sure it has a compatible property */
        if (strict && (!of_get_property(bus, "compatible", NULL))) {
                pr_debug("%s() - skipping %pOF, no compatible prop\n",
                         __func__, bus);
                return 0;
        }

        /* Skip nodes for which we don't want to create devices */
        if (unlikely(of_match_node(of_skipped_node_table, bus))) {
                pr_debug("%s() - skipping %pOF node\n", __func__, bus);
                return 0;
        }

        /* The flag is set at the end of this function once children are done. */
        if (of_node_check_flag(bus, OF_POPULATED_BUS)) {
                pr_debug("%s() - skipping %pOF, already populated\n",
                        __func__, bus);
                return 0;
        }

        /* Optional auxdata supplies the device name and platform_data. */
        auxdata = of_dev_lookup(lookup, bus);
        if (auxdata) {
                bus_id = auxdata->name;
                platform_data = auxdata->platform_data;
        }

        if (of_device_is_compatible(bus, "arm,primecell")) {
                /*
                 * Don't return an error here to keep compatibility with older
                 * device tree files.
                 */
                of_amba_device_create(bus, bus_id, platform_data, parent);
                return 0;
        }

        /*
         * Create the platform device for this node.  Only descend into the
         * children when the device was created and the node matches @matches.
         */
        dev = of_platform_device_create_pdata(bus, bus_id, platform_data, parent);
        if (!dev || !of_match_node(matches, bus))
                return 0;

        /* The device just created becomes the parent of the children's devices. */
        for_each_child_of_node(bus, child) {
                pr_debug("   create child: %pOF\n", child);
                rc = of_platform_bus_create(child, matches, lookup, &dev->dev, strict);
                if (rc) {
                        of_node_put(child);
                        break;
                }
        }
        of_node_set_flag(bus, OF_POPULATED_BUS);
        return rc;
}

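Some readers might wonder why the function is named after a bus (“of_platform_bus_create”) rather than a device (“of_platform_device_create”). The reason is that, besides creating a device for the given node, it recurses into the node's children only when the node matches the matches table. Remember that the matches parameter passed to this function is of_default_bus_match_table, which in most cases matches a node whose compatible string is “simple-bus”.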

Allocating and initializing platform device from device node

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
/**
 * of_platform_device_create_pdata - Alloc, initialize and register an of_device
 * @np: pointer to node to create device for
 * @bus_id: name to assign device
 * @platform_data: pointer to populate platform_data pointer with
 * @parent: Linux device model parent device.
 *
 * Returns pointer to created platform device, or NULL if a device was not
 * registered.  Unavailable devices will not get registered.
 */
static struct platform_device *of_platform_device_create_pdata(
                                        struct device_node *np,
                                        const char *bus_id,
                                        void *platform_data,
                                        struct device *parent)
{
        struct platform_device *dev;

        /*
         * Bail out when the node is not available or when
         * of_node_test_and_set_flag() returns non-zero for OF_POPULATED
         * (presumably meaning the flag was already set -- confirm).
         */
        if (!of_device_is_available(np) ||
            of_node_test_and_set_flag(np, OF_POPULATED))
                return NULL;

        dev = of_device_alloc(np, bus_id, parent);
        if (!dev)
                goto err_clear_flag;

        /* Default to a 32-bit DMA mask; keep dma_mask if one is already set. */
        dev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
        if (!dev->dev.dma_mask)
                dev->dev.dma_mask = &dev->dev.coherent_dma_mask;
        /* Attach the device to the platform bus. */
        dev->dev.bus = &platform_bus_type;
        dev->dev.platform_data = platform_data;
        of_msi_configure(&dev->dev, dev->dev.of_node);

        if (of_device_add(dev) != 0) {
                platform_device_put(dev);
                goto err_clear_flag;
        }

        return dev;

        /* Failure: clear OF_POPULATED again, undoing the test-and-set above. */
err_clear_flag:
        of_node_clear_flag(np, OF_POPULATED);
        return NULL;
}

The above function allocates the platform_device (of_device_alloc) and assigns the platform bus's bus_type, platform_bus_type, to the generated device. After that, the generated device is registered to the bus sub-system with the device_add function that we covered before; device_add is invoked inside the of_device_add function. Let’s go step by step from the creation of the device to its registration.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
/*
 * A platform device.  It embeds the generic struct device (@dev) and
 * carries a name/id pair plus a table of @num_resources entries in
 * @resource.
 */
struct platform_device {
        const char      *name;
        int             id;
        bool            id_auto;
        struct device   dev;
        u64             platform_dma_mask;
        struct device_dma_parameters dma_parms;
        u32             num_resources;
        struct resource *resource;

        const struct platform_device_id *id_entry;
        char *driver_override; /* Driver name to force a match */

        /* MFD cell pointer */
        struct mfd_cell *mfd_cell;

        /* arch specific additions */
        struct pdev_archdata    archdata;
};


/**
 * of_device_alloc - Allocate and initialize an of_device
 * @np: device node to assign to device
 * @bus_id: Name to assign to the device.  May be null to use default name.
 * @parent: Parent device.
 *
 * Builds the resource table of the new platform device from the address
 * and interrupt information of @np, links the device to @np and names it.
 * The device is not registered here.
 *
 * Returns the allocated platform device, or NULL when an allocation fails.
 */
struct platform_device *of_device_alloc(struct device_node *np,
                                  const char *bus_id,
                                  struct device *parent)
{
        struct platform_device *dev;
        int rc, i, num_reg = 0, num_irq;
        struct resource *res, temp_res;

        dev = platform_device_alloc("", PLATFORM_DEVID_NONE);
        if (!dev)
                return NULL;

        /* count the io and irq resources */
        while (of_address_to_resource(np, num_reg, &temp_res) == 0)
                num_reg++;
        num_irq = of_irq_count(np);

        /* Populate the resource table */
        if (num_irq || num_reg) {
                res = kcalloc(num_irq + num_reg, sizeof(*res), GFP_KERNEL);
                if (!res) {
                        platform_device_put(dev);
                        return NULL;
                }

                dev->num_resources = num_reg + num_irq;
                dev->resource = res;
                /* address resources first; res ends up at the first irq slot */
                for (i = 0; i < num_reg; i++, res++) {
                        rc = of_address_to_resource(np, i, res);
                        WARN_ON(rc);
                }
                if (of_irq_to_resource_table(np, res, num_irq) != num_irq)
                        pr_debug("not all legacy IRQ resources mapped for %pOFn\n",
                                 np);
        }

        dev->dev.of_node = of_node_get(np);
        dev->dev.fwnode = &np->fwnode;
        /* fall back to platform_bus when no parent was given */
        dev->dev.parent = parent ? : &platform_bus;

        if (bus_id)
                dev_set_name(&dev->dev, "%s", bus_id);
        else
                of_device_make_bus_id(&dev->dev);

        return dev;
}
EXPORT_SYMBOL(of_device_alloc);

Compared to the registration of other devices, all the information required to populate and manage the device is described in the device tree itself. That information should be stored in the generated device structure so that the device driver that will later be bound to the device can manage it properly.

One device tree node can contain two important kinds of resources related to the device: registers and interrupts. However, until the device tree node has been parsed, we cannot know how many resources the device has. Therefore, to allocate an array dynamically sized to the number of resources, the function first counts how many register and interrupt resources are available for the device. Based on those counts, it allocates a resource array and copies all resources from the device tree node into it. After the resources have been successfully parsed and stored in the resource array of the platform device, it stores the device node in the device's of_node member field, which the device driver will later need in order to access the device tree node. It also sets the parent of the device to platform_bus when the passed parent is NULL.

After the platform_device object for the current device node has been properly allocated and set up, the function returns the device. The returned device is later passed to the of_device_add function, which registers the generated device to the bus subsystem.

1
2
3
4
5
6
7
8
9
10
        /* Excerpt from of_platform_device_create_pdata(), shown in full earlier. */
        dev = of_device_alloc(np, bus_id, parent);
        if (!dev)
                goto err_clear_flag;
        
        dev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
        if (!dev->dev.dma_mask)
                dev->dev.dma_mask = &dev->dev.coherent_dma_mask;
        /* the new device is attached to the platform bus here */
        dev->dev.bus = &platform_bus_type;
        dev->dev.platform_data = platform_data;
        of_msi_configure(&dev->dev, dev->dev.of_node);

Note that the above code of of_platform_device_create_pdata sets the bus of the generated platform device to platform_bus_type, which was registered as the bus for platform devices in the platform_bus_init function.

Adding generated platform_device

Then let’s take a look at how the generated platform device can be added to the driver sub-system.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
int of_device_add(struct platform_device *ofdev)
{
        BUG_ON(ofdev->dev.of_node == NULL);

        /* name and id have to be set so that the platform bus doesn't get
         * confused on matching */          
        ofdev->name = dev_name(&ofdev->dev);
        ofdev->id = PLATFORM_DEVID_NONE;

        /*
         * If this device has not binding numa node in devicetree, that is
         * of_node_to_nid returns NUMA_NO_NODE. device_add will assume that this
         * device is on the same node as the parent.
         */                                      
        set_dev_node(&ofdev->dev, of_node_to_nid(ofdev->dev.of_node));
                                                 
        return device_add(&ofdev->dev);
}  

The of_device_add function is kind of a wrapper function that sets some fields of the platform device for one device node and just invokes the device_add function with its device structure embedded in the platform_device structure.

Now it’s time to revisit the device_add function once again. Previously, device_add was invoked to add the platform bus itself as a device to the driver system; this time it is invoked to add a platform device to the platform bus. Because most details have already been covered, we will highlight only the parts related to registering the device to the bus.

In the middle of device_add function, it invokes bus_add_device function with the passed device structure.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
int bus_add_device(struct device *dev)
{       
        struct bus_type *bus = bus_get(dev->bus);
        int error = 0;

        if (bus) {
                pr_debug("bus: '%s': add device %s\n", bus->name, dev_name(dev));
                error = device_add_groups(dev, bus->dev_groups);
                if (error)
                        goto out_put;
                error = sysfs_create_link(&bus->p->devices_kset->kobj,
                                                &dev->kobj, dev_name(dev));
                if (error)
                        goto out_groups;
                error = sysfs_create_link(&dev->kobj,
                                &dev->bus->p->subsys.kobj, "subsystem");
                if (error)
                        goto out_subsys;
                klist_add_tail(&dev->p->knode_bus, &bus->p->klist_devices);
        }
        return 0;
        
out_subsys:
        sysfs_remove_link(&bus->p->devices_kset->kobj, dev_name(dev));
out_groups:     
        device_remove_groups(dev, bus->dev_groups);
out_put:                
        bus_put(dev->bus);
        return error; 
}   

Compared to the previous device_add call, where the device had no bus field set because it was the platform bus device itself, the current platform device’s device has a bus, which is platform_bus_type. After invoking several functions to register the device in sysfs, the klist_add_tail(&dev->p->knode_bus, &bus->p->klist_devices) call adds the current device to the klist_devices klist, which is managed by the target bus. Note that the private field of the bus is used to register the device to the bus subsystem.

Remember that a private subsystem was allocated for platform_bus_type when the bus_register function was invoked in platform_bus_init. Because platform_bus_type is a global structure that has been initialized with this private subsystem, any device that wants to be attached to the platform bus must make its bus member field reference platform_bus_type.

Binding the device to the driver

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/**      
 * bus_probe_device - probe drivers for a new device
 * @dev: device to probe 
 *       
 * - Automatically probe for a driver if the bus allows it.
 */      
void bus_probe_device(struct device *dev)
{               
        struct bus_type *bus = dev->bus;
        struct subsys_interface *sif;

        if (!bus)
                return;
                
        if (bus->p->drivers_autoprobe)
                device_initial_probe(dev);
        
        mutex_lock(&bus->p->mutex);
        list_for_each_entry(sif, &bus->p->interfaces, node)
                if (sif->add_dev)
                        sif->add_dev(dev, sif);
        mutex_unlock(&bus->p->mutex);
}               

After the device has been registered to the platform_bus_type bus, the device can be bound to the corresponding driver if possible. We didn’t cover this function before because the platform_bus device has no bus, so the function simply returned without trying to bind a driver to the device. The current platform device, however, has a bus, so it can ask the bus to find the driver associated with it.

drivers/base/dd.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
void device_initial_probe(struct device *dev)
{
        __device_attach(dev, true);
}

/*
 * __device_attach - try to attach @dev to a driver.
 * @dev: device to attach.
 * @allow_async: whether drivers that want asynchronous probing may be
 *               scheduled for a later, asynchronous attempt.
 *
 * Runs with the device lock held for the whole attempt.
 *
 * Returns 0 for a dead device, 1 when @dev already had a driver set and is
 * (or has just been) bound to it, 0 when binding that preset driver failed,
 * and otherwise the value returned by bus_for_each_drv().
 */
static int __device_attach(struct device *dev, bool allow_async)
{
        int ret = 0;

        device_lock(dev);
        if (dev->p->dead) {
                /* Dead device: leave ret at 0 and do nothing. */
                goto out_unlock;
        } else if (dev->driver) {
                /* A driver was already chosen for this device. */
                if (device_is_bound(dev)) {
                        ret = 1;
                        goto out_unlock;
                }
                ret = device_bind_driver(dev);
                if (ret == 0)
                        ret = 1;
                else {
                        /* Binding failed: forget the driver and report 0. */
                        dev->driver = NULL;
                        ret = 0;
                }
        } else {
                /*
                 * No driver yet: offer the device to every driver on the bus.
                 * The callback receives this structure through its void
                 * pointer argument.
                 */
                struct device_attach_data data = {
                        .dev = dev,
                        .check_async = allow_async,
                        .want_async = false,
                };

                if (dev->parent)
                        pm_runtime_get_sync(dev->parent);

                ret = bus_for_each_drv(dev->bus, NULL, &data,
                                        __device_attach_driver);
                if (!ret && allow_async && data.have_async) {
                        /*
                         * If we could not find appropriate driver
                         * synchronously and we are allowed to do
                         * async probes and there are drivers that
                         * want to probe asynchronously, we'll
                         * try them.
                         */
                        dev_dbg(dev, "scheduling asynchronous probe\n");
                        get_device(dev);
                        async_schedule_dev(__device_attach_async_helper, dev);
                } else {
                        pm_request_idle(dev);
                }

                /* Pairs with the pm_runtime_get_sync() on the parent above. */
                if (dev->parent)
                        pm_runtime_put(dev->parent);
        }
out_unlock:
        device_unlock(dev);
        return ret;
}

Because our device is not dead and has no driver assigned yet, the final else branch is executed, and the bus_for_each_drv function runs the __device_attach_driver function against all drivers managed by the bus.

Before the iteration starts, __device_attach sets up data, which contains the device that we are trying to register, so that the __device_attach_driver function can access the device. bus_for_each_drv then traverses the klist_drivers of the bus, selecting the drivers registered to the bus one by one and passing each of them to the __device_attach_driver function. As a result, that function has access not only to the device but also to a candidate driver that might manage our device.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
/*
 * __device_attach_driver - bus_for_each_drv() callback that tries one driver.
 * @drv: candidate driver.
 * @_data: struct device_attach_data carrying the device and the async policy.
 *
 * Returns 0 when @drv does not match or is skipped because of the async
 * policy, the negative error when the bus match fails (other than
 * -EPROBE_DEFER), and otherwise the result of driver_probe_device().
 */
static int __device_attach_driver(struct device_driver *drv, void *_data)
{
        struct device_attach_data *attach = _data;
        struct device *dev = attach->dev;
        bool can_async;
        int match;

        match = driver_match_device(drv, dev);
        if (!match)
                return 0;       /* no match */

        if (match == -EPROBE_DEFER) {
                dev_dbg(dev, "Device match requests probe deferral\n");
                driver_deferred_probe_add(dev);
                /* no return here: execution continues below */
        } else if (match < 0) {
                dev_dbg(dev, "Bus failed to match device: %d\n", match);
                return match;
        }
        /* match > 0 means positive match */

        can_async = driver_allows_async_probing(drv);
        if (can_async)
                attach->have_async = true;

        /* Skip drivers whose async preference differs from the wanted one. */
        if (attach->check_async && can_async != attach->want_async)
                return 0;

        return driver_probe_device(drv, dev);
}

/*
 * driver_match_device - ask the driver's bus whether @drv can handle @dev.
 *
 * Returns the result of the bus's match callback, or 1 when the bus does
 * not provide one.
 */
static inline int driver_match_device(struct device_driver *drv,
                                      struct device *dev)
{
        if (!drv->bus->match)
                return 1;

        return drv->bus->match(dev, drv);
}

Note that the currently selected driver, passed as the first argument of the __device_attach_driver function, is just one of the drivers registered to the bus. Therefore, the first job of this function is to figure out whether the currently selected driver is able to manage our device. To achieve this, it invokes the driver_match_device function with the selected driver and device object. This is a simple inline function that invokes the match function of the driver’s bus, or returns 1 when the bus does not provide one.

Platform device match function

1
2
3
4
5
6
7
8
9
10
11
/*
 * Bus type for platform devices.  The callbacks below are the platform
 * implementations that the generic driver code reaches through
 * dev->bus / drv->bus.
 */
struct bus_type platform_bus_type = {
        .name           = "platform",
        /* attribute groups added to each device by bus_add_device() */
        .dev_groups     = platform_dev_groups,
        /* called through driver_match_device() to pair a device and driver */
        .match          = platform_match,
        .uevent         = platform_uevent,
        /* preferred by really_probe() over the driver's own probe */
        .probe          = platform_probe,
        .remove         = platform_remove,
        .shutdown       = platform_shutdown,
        /* invoked by really_probe() before the probe callback */
        .dma_configure  = platform_dma_configure,
        .pm             = &platform_dev_pm_ops,
};

Because we assigned platform_bus_type as our platform device’s bus, its match function, platform_match, is invoked.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
/**
 * platform_match - bind platform device to platform driver.
 * @dev: device.
 * @drv: driver.
 *
 * Platform device IDs are assumed to look like "<name><instance>": <name>
 * is a short description of the device type (e.g. "pci" or "floppy") and
 * <instance> is the enumerated instance (e.g. '0' or '42').  Driver IDs are
 * just "<name>".  The <name> of the platform_device is therefore compared
 * with the driver name.
 *
 * Matching is attempted in this order: driver_override, OF, ACPI, the
 * driver's id table, and finally the plain name.  Returns non-zero on a
 * match and 0 otherwise.
 */
static int platform_match(struct device *dev, struct device_driver *drv)
{
        struct platform_device *pdev = to_platform_device(dev);
        struct platform_driver *pdrv = to_platform_driver(drv);

        /* With driver_override set, only the named driver may bind. */
        if (pdev->driver_override)
                return strcmp(pdev->driver_override, drv->name) == 0;

        /* OF style match first, ACPI style match second. */
        if (of_driver_match_device(dev, drv) ||
            acpi_driver_match_device(dev, drv))
                return 1;

        /* Next, consult the driver's id table if it has one. */
        if (pdrv->id_table)
                return platform_match_id(pdrv->id_table, pdev) != NULL;

        /* Last resort: compare device and driver names. */
        return !strcmp(pdev->name, drv->name);
}

When the platform_match function is invoked, it first translates the generic driver and device into the platform driver and platform device. Each bus uses its own device and driver types according to its implementation, but because the match function is invoked through the generic interface of the Linux driver core, the match function of the bus has to accept the generic device and driver types and translate them to the bus-specific ones.

After that, it tries several different match functions, because a platform device can be registered through multiple different methods. We have a platform device detected and generated from the device tree, so the of_driver_match_device function should match the device and driver using the compatible strings specified in a device driver that supports the device tree. When the driver does not support the device tree, the other matching functions are in charge of device-to-driver matching.

Binding the matching driver and device

If the matching function finds a driver claiming that it can support the device currently being registered, the __device_attach_driver function invokes the driver_probe_device function, which finally binds the device and its corresponding driver.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
/**
 * driver_probe_device - attempt to bind device & driver together
 * @drv: driver to bind a device to
 * @dev: device to try to bind to the driver
 *
 * Returns -ENODEV when @dev is not registered, 1 when the device has been
 * bound successfully and 0 otherwise.
 *
 * The caller must hold the @dev lock.  For a USB interface the lock of
 * @dev->parent must be held too.
 *
 * A parent device, if any, is runtime-resumed before the driver is probed.
 */
static int driver_probe_device(struct device_driver *drv, struct device *dev)
{
        int ret;

        if (!device_is_registered(dev))
                return -ENODEV;

        pr_debug("bus: '%s': %s: matched device %s with driver %s\n",
                 drv->bus->name, __func__, dev_name(dev), drv->name);

        /* Runtime-PM references are taken before probing ... */
        pm_runtime_get_suppliers(dev);
        if (dev->parent)
                pm_runtime_get_sync(dev->parent);

        pm_runtime_barrier(dev);
        ret = initcall_debug ? really_probe_debug(dev, drv)
                             : really_probe(dev, drv);
        pm_request_idle(dev);

        /* ... and released again once the probe attempt is over. */
        if (dev->parent)
                pm_runtime_put(dev->parent);

        pm_runtime_put_suppliers(dev);
        return ret;
}

To bind the driver to the device, the probe function of the matching driver should be invoked; it is the entry point of the binding process. As shown in the code, when initcall_debug is not set, the really_probe function is invoked, and it calls the probe function of the matching driver.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
/*
 * really_probe - run the probe sequence that binds @drv to @dev.
 * @dev: device to probe.
 * @drv: driver that matched @dev.
 *
 * Returns 1 when the device was bound.  Returns 0 when a step of the probe
 * sequence failed; the error is only logged (or turned into a deferred
 * probe) so that the next driver can be tried.  Returns -EPROBE_DEFER when
 * all probing is currently deferred, the non-zero result of
 * device_links_check_suppliers(), or -EBUSY when managed resources are
 * already attached to @dev before probing.
 *
 * Every failing step stores its error code in ret before jumping to the
 * cleanup labels, so that the switch at the end reports and classifies the
 * real error rather than a stale value.
 */
static int really_probe(struct device *dev, struct device_driver *drv)
{
        int ret = -EPROBE_DEFER;
        int local_trigger_count = atomic_read(&deferred_trigger_count);
        bool test_remove = IS_ENABLED(CONFIG_DEBUG_TEST_DRIVER_REMOVE) &&
                           !drv->suppress_bind_attrs;

        if (defer_all_probes) {
                /*
                 * Value of defer_all_probes can be set only by
                 * device_block_probing() which, in turn, will call
                 * wait_for_device_probe() right after that to avoid any races.
                 */
                dev_dbg(dev, "Driver %s force probe deferral\n", drv->name);
                driver_deferred_probe_add(dev);
                return ret;
        }

        ret = device_links_check_suppliers(dev);
        if (ret == -EPROBE_DEFER)
                driver_deferred_probe_add_trigger(dev, local_trigger_count);
        if (ret)
                return ret;

        atomic_inc(&probe_count);
        pr_debug("bus: '%s': %s: probing driver %s with device %s\n",
                 drv->bus->name, __func__, drv->name, dev_name(dev));
        if (!list_empty(&dev->devres_head)) {
                dev_crit(dev, "Resources present before probing\n");
                ret = -EBUSY;
                goto done;
        }

re_probe:
        /* From here on the bus/driver callbacks can find drv via dev->driver. */
        dev->driver = drv;

        /* If using pinctrl, bind pins now before probing */
        ret = pinctrl_bind_pins(dev);
        if (ret)
                goto pinctrl_bind_failed;

        if (dev->bus->dma_configure) {
                ret = dev->bus->dma_configure(dev);
                if (ret)
                        goto probe_failed;
        }

        /* Keep the error code so the failure path reports it. */
        ret = driver_sysfs_add(dev);
        if (ret) {
                pr_err("%s: driver_sysfs_add(%s) failed\n",
                       __func__, dev_name(dev));
                goto probe_failed;
        }

        if (dev->pm_domain && dev->pm_domain->activate) {
                ret = dev->pm_domain->activate(dev);
                if (ret)
                        goto probe_failed;
        }

        /* A bus-level probe takes precedence over the driver's own probe. */
        if (dev->bus->probe) {
                ret = dev->bus->probe(dev);
                if (ret)
                        goto probe_failed;
        } else if (drv->probe) {
                ret = drv->probe(dev);
                if (ret)
                        goto probe_failed;
        }

        /* Keep the error code so the failure path reports it. */
        ret = device_add_groups(dev, drv->dev_groups);
        if (ret) {
                dev_err(dev, "device_add_groups() failed\n");
                goto dev_groups_failed;
        }

        if (dev_has_sync_state(dev)) {
                /* Keep the error code so the failure path reports it. */
                ret = device_create_file(dev, &dev_attr_state_synced);
                if (ret) {
                        dev_err(dev, "state_synced sysfs add failed\n");
                        goto dev_sysfs_state_synced_failed;
                }
        }

        if (test_remove) {
                /*
                 * Debug option: undo the successful probe once and probe
                 * again; clearing the flag makes this happen only one time.
                 */
                test_remove = false;

                device_remove_file(dev, &dev_attr_state_synced);
                device_remove_groups(dev, drv->dev_groups);

                if (dev->bus->remove)
                        dev->bus->remove(dev);
                else if (drv->remove)
                        drv->remove(dev);

                devres_release_all(dev);
                driver_sysfs_remove(dev);
                dev->driver = NULL;
                dev_set_drvdata(dev, NULL);
                if (dev->pm_domain && dev->pm_domain->dismiss)
                        dev->pm_domain->dismiss(dev);
                pm_runtime_reinit(dev);

                goto re_probe;
        }

        pinctrl_init_done(dev);

        if (dev->pm_domain && dev->pm_domain->sync)
                dev->pm_domain->sync(dev);

        driver_bound(dev);
        ret = 1;
        pr_debug("bus: '%s': %s: bound device %s to driver %s\n",
                 drv->bus->name, __func__, dev_name(dev), drv->name);
        goto done;

        /* Failure paths: each label undoes one more step, then falls through. */
dev_sysfs_state_synced_failed:
        device_remove_groups(dev, drv->dev_groups);
dev_groups_failed:
        if (dev->bus->remove)
                dev->bus->remove(dev);
        else if (drv->remove)
                drv->remove(dev);
probe_failed:
        if (dev->bus)
                blocking_notifier_call_chain(&dev->bus->p->bus_notifier,
                                             BUS_NOTIFY_DRIVER_NOT_BOUND, dev);
pinctrl_bind_failed:
        device_links_no_driver(dev);
        devres_release_all(dev);
        arch_teardown_dma_ops(dev);
        driver_sysfs_remove(dev);
        dev->driver = NULL;
        dev_set_drvdata(dev, NULL);
        if (dev->pm_domain && dev->pm_domain->dismiss)
                dev->pm_domain->dismiss(dev);
        pm_runtime_reinit(dev);
        dev_pm_set_driver_flags(dev, 0);

        switch (ret) {
        case -EPROBE_DEFER:
                /* Driver requested deferred probing */
                dev_dbg(dev, "Driver %s requests probe deferral\n", drv->name);
                driver_deferred_probe_add_trigger(dev, local_trigger_count);
                break;
        case -ENODEV:
        case -ENXIO:
                pr_debug("%s: probe of %s rejects match %d\n",
                         drv->name, dev_name(dev), ret);
                break;
        default:
                /* driver matched but the probe failed */
                pr_warn("%s: probe of %s failed with error %d\n",
                        drv->name, dev_name(dev), ret);
        }
        /*
         * Ignore errors returned by ->probe so that the next driver can try
         * its luck.
         */
        ret = 0;
done:
        atomic_dec(&probe_count);
        wake_up_all(&probe_waitqueue);
        return ret;
}

Although there are many complex details in the really_probe function, we are only interested in the point where the probe function of the matching driver is invoked. In the middle of the function, you can find the code block below, which invokes the probe function of either the bus or the driver, depending on whether the bus provides one.

1
2
3
4
5
6
7
8
9
        /* A probe callback on the bus takes precedence over the driver's. */
        if (dev->bus->probe) {
                ret = dev->bus->probe(dev);
                if (ret)
                        goto probe_failed;
        } else if (drv->probe) {
                /* No bus-level probe: call the driver's probe directly. */
                ret = drv->probe(dev);
                if (ret)
                        goto probe_failed;
        }

Because our bus, platform_bus_type has its own probe function, platform_probe, the probe function of the bus should be called instead of invoking the driver’s probe function directly. Let’s see the detailed implementation of the probe function of our bus, platform_probe.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
/*
 * platform_probe - probe callback of the platform bus.
 * @_dev: generic device embedded in the platform device being probed.
 *
 * Applies the default clock settings of the device node, attaches the PM
 * domain and then calls the platform driver's probe, if it has one, with
 * the platform device.  The PM domain is detached again when that probe
 * fails.
 *
 * Returns -ENXIO for drivers whose probe is platform_probe_fail, a negative
 * result of of_clk_set_defaults(), or otherwise the result of the PM domain
 * attach / driver probe, where -EPROBE_DEFER is turned into -ENXIO for
 * drivers that set prevent_deferred_probe.
 */
static int platform_probe(struct device *_dev)
{
        struct platform_driver *pdrv = to_platform_driver(_dev->driver);
        struct platform_device *pdev = to_platform_device(_dev);
        int ret;

        /*
         * Drivers registered through platform_driver_probe() cannot be bound
         * a second time: their probe function usually lives in __init code
         * and is gone by then.  __platform_driver_probe() points .probe of
         * such drivers at platform_probe_fail.  To match the traditional
         * behaviour, clocks and PM domains are not even prepared for them.
         */
        if (unlikely(pdrv->probe == platform_probe_fail))
                return -ENXIO;

        ret = of_clk_set_defaults(_dev->of_node, false);
        if (ret < 0)
                return ret;

        /* The driver's probe runs only if the PM domain attach succeeded. */
        ret = dev_pm_domain_attach(_dev, true);
        if (!ret && pdrv->probe) {
                ret = pdrv->probe(pdev);
                if (ret)
                        dev_pm_domain_detach(_dev, true);
        }

        if (pdrv->prevent_deferred_probe && ret == -EPROBE_DEFER) {
                dev_warn(_dev, "probe deferral not supported\n");
                ret = -ENXIO;
        }

        return ret;
}

Although platform_probe only receives the device, really_probe has already stored the matching driver in the device’s driver field (check the re_probe label of the really_probe function), so the matching driver’s object can be retrieved from the device. Because the platform_probe function is a generic wrapper probe for all platform devices, it invokes several functions that manage the device as a platform device, such as attaching the power domain or setting the clk defaults for the device. After those generic settings are done, the real probe function of the matching driver is invoked. Although the platform_probe function passes only the platform device object to the driver’s probe function, other buses can support a different prototype for the probe function. In that case, the bus’s probe function prepares those arguments before the driver’s probe function is invoked.

For the probe function of the matching driver, you should take a look at the implementation of the probe function in the corresponding device driver. We are not going to look at the probe function of any particular device in this post. After the probe function of the matching driver has been invoked, the rest of the device_register function is executed.

This post is licensed under CC BY 4.0 by the author.

Comments powered by Disqus.