Monday, 10 July 2017

Getting list of network devices inside the Linux kernel

#include <linux/netdevice.h>

struct net_device *dev;

read_lock(&dev_base_lock);

dev = first_net_device(&init_net);
while (dev) {
    printk(KERN_INFO "found [%s]\n", dev->name);
    dev = next_net_device(dev);
}

read_unlock(&dev_base_lock);

Network Subsystem initialization call flow in linux

Network subsystem in Linux /net/core/dev.c file

1.  net_dev_init()(Initialize the DEV module. At boot time this walks the device list and unhooks any devices that fail to initialise (normally hardware not present) and leaves us with a valid list of present and active devices.)
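     1. dev_proc_init(), defined in net/core/net-procfs.c, creates the proc file system entries for network devices (dev), statistics, softnet and packet types (ptype).
     2. netdev_kobject_init() initializes the sysfs (kobject) attributes of network devices.
     3. register_pernet_subsys(&netdev_net_ops) registers the per-network-namespace init/exit operations, which set up each namespace's device list and its name/index hash tables.
     4. skb_queue_head_init(&sd->input_pkt_queue) initializes the input packet queue for each CPU.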
     http://elixir.free-electrons.com/linux/latest/source/include/linux/netdevice.h#L2755
       /*
        * Per-CPU networking state: net_dev_init() initialises the
        * input_pkt_queue and process_queue members below once for each CPU.
        */
       struct softnet_data {
     struct list_head    poll_list;
     /* process queue; initialised per CPU with skb_queue_head_init() */
     struct sk_buff_head    process_queue;

    /* stats */
    unsigned int        processed;
    unsigned int        time_squeeze;
    unsigned int        received_rps;
  #ifdef CONFIG_RPS
    /* only present when CONFIG_RPS is enabled */
    struct softnet_data    *rps_ipi_list;
 #endif
 #ifdef CONFIG_NET_FLOW_LIMIT
    /* only present when CONFIG_NET_FLOW_LIMIT is enabled */
    struct sd_flow_limit __rcu *flow_limit;
 #endif
    struct Qdisc        *output_queue;
    struct Qdisc        **output_queue_tailp;
    struct sk_buff        *completion_queue;

 #ifdef CONFIG_RPS
    /* input_queue_head should be written by cpu owning this struct,
     * and only read by other cpus. Worth using a cache line.
     */
    unsigned int        input_queue_head ____cacheline_aligned_in_smp;

    /* Elements below can be accessed between CPUs for RPS/RFS */
    struct call_single_data    csd ____cacheline_aligned_in_smp;
    struct softnet_data    *rps_ipi_next;
    unsigned int        cpu;
    unsigned int        input_queue_tail;
 #endif
    unsigned int        dropped;
    /* input packet queue; initialised per CPU with skb_queue_head_init() */
    struct sk_buff_head    input_pkt_queue;
    struct napi_struct    backlog;

 };
     5. skb_queue_head_init(&sd->process_queue) initializes the process queue for each CPU.
     6. register_pernet_device(&loopback_net_ops) (The loopback device is special if any other network devices is present in a network namespace the loopback device must be present. Since we now dynamically allocate and free the
      loopback device ensure this invariant is maintained by keeping the loopback device as the first device on the  list of network devices.  Ensuring the loopback devices is the first device that appears and the last network device that disappears.)
     7. register_pernet_device(&default_device_ops). Initialize default device operations.
     8. open_softirq(NET_TX_SOFTIRQ, net_tx_action) and open_softirq(NET_RX_SOFTIRQ, net_rx_action). Register the Tx and Rx softirqs for the network subsystem.
     9. dst_subsys_init() registers the network device notifier callback functions.
   
   
    

Thursday, 6 July 2017

Linux device driver loading procedure in boot up

// driver call flow
start_kernel()
   |
rest_init()
   |
kernel_thread(kernel_init, NULL, CLONE_FS);
   |
kernel_init()
   |
kernel_init_freeable()
   |
do_basic_setup()(the machine is now initialized. None of the devices have been touched yet, but the CPU subsystem is up and running, and memory and process management works.)
   |
   1.driver_init() //drivers/base/init.c (Call the driver model init functions to initialize their subsystems. Called early from init/main.c)
   2.do_initcalls() The purpose of this loop is to execute each of the init functions corresponding to each of the initcall levels (All built-in modules initialized with module_init () are represented by initcall level 6).


It depends on whether the driver is a built-in module or compiled as a loadable module. I'll be talking about a built-in module in this answer (one with y in .config):

module_init expands to  #define module_init(x)  __initcall(x);

which then expands to

    
/* __initcall(fn) is simply an alias for device_initcall(fn). */
#define __initcall(fn) device_initcall(fn)
/* device_initcall(fn) registers fn as an initcall with id (level) 6. */
#define device_initcall(fn)             __define_initcall(fn, 6)

If your driver's initialization routine is called strtdrv, then module_init(strtdrv) becomes device_initcall(strtdrv), which in turn becomes __define_initcall(strtdrv, 6).

__define_initcall expands to

    
    /*
     * Defines a static initcall_t variable named __initcall_<fn><id>,
     * initialised to fn and placed in the ELF section ".initcall<id>.init".
     */
    #define __define_initcall(fn, id) \
    static initcall_t __initcall_##fn##id __used \
    __attribute__((__section__(".initcall" #id ".init"))) = fn



which means we now have

       static initcall_t __initcall_strtdrv6 __used __attribute__((__section__(".initcall6.init"))) = strtdrv;
   
  
A new symbol is created, called __initcall_strtdrv6, which is inserted into the ELF section called .initcall6.init, which points to a routine called strtdrv.

If we take a look at init/main.c, do_basic_setup() has a call to do_initcalls().

/*
 * Run the initcalls of every level in ascending order by calling
 * do_initcall_level() for level 0 up to ARRAY_SIZE(initcall_levels) - 2.
 */
static void __init do_initcalls(void)
{
    int level;

    /*
     * The "- 1" leaves out the last initcall_levels[] entry: it is only used
     * as the end bound (initcall_levels[level+1]) of the level before it.
     */
    for (level = 0; level < ARRAY_SIZE(initcall_levels) - 1; level++)
        do_initcall_level(level);
}


The purpose of this loop is to execute each of the init functions corresponding to each of the initcall levels (All built-in modules initialized with module_init () are represented by initcall level 6).

We will now expand do_initcall_level(level) and focus on a particular chunk of code:

/*
 * Abridged excerpt: call every initcall registered for the given level.
 * The parts replaced by "some code" are elided.
 */
static void __init do_initcall_level(int level)
{
    /* some code */
    initcall_t *fn;
    /*
     * Walk from the first entry of this level up to, but not including,
     * the first entry of the next level, calling each one.
     */
    for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++)
    do_one_initcall(*fn);
    /* some code */
}

The pointer fn initially points to the first function pointer registered in the ELF section for that level and is incremented by one entry (sizeof(initcall_t)) until the start of the next level's section is reached; at each step the function it points to is called and that init function is executed. So in our case, do_one_initcall() will simply call the driver's initialization routine strtdrv().

    int __init_or_module do_one_initcall(initcall_t fn) {
    /* some code */
    ret = fn(); // which in our case is strtdrv(), so ret = strtdrv();
    /* some code */



and what happens next depends on the routine's code. After the initialization is done, an architecture specific function called free_initmem is called to clean up the memory pages consumed by the init functions and its data.


Interrupt handling in ARM

We'll cover the whole interrupt stuff in two sections:

    Interrupt setup - Explanation of Generic and architecture specific setup that kernel does.

    Interrupt handling - Explanation of what happens after the processor receives an interrupt.

1. Interrupt setup
start_kernel( ) is the first 'C' function that opens its eyes when the kernel is booting up. It initializes various subsystems of the kernel, including the IRQ system. Initialization of IRQs requires that you have a valid vector table in place and that you have first-level interrupt handlers in place; both of these things are architecture specific. Let's set up the vector table first.

 start_kernel()
        |
 setup_arch()
       |
 paging_init()
     |
 devicemaps_init() (Set up the device mappings.  Since we clear out the page tables for all mappings above VMALLOC_START, except early fixmap, we might remove debug
  device mappings.  This means earlycon can be used to debug this function any other function or debugging method which may touch any device _will_ crash the kernel.)
      |
  early_trap_init

start_kernel( ) indirectly (through the call chain shown above) calls a function called early_trap_init( ), which does the following:
  •   Sets up the exception vector table at location 0xffff0000. 
  •   Flushes the icache in the range 0xffff0000 to 0xffff0000 + PAGE_SIZE. This is required because the memcpy() calls below copy the vector table and vector stubs to 0xffff0000. 
      /* * Copy the vectors, stubs and kuser helpers (in entry-armv.S)
         * into the vector page, mapped at 0xffff0000, and ensure these
         * are visible to the instruction stream.
         */
        memcpy((void *)vectors, __vectors_start, __vectors_end - __vectors_start);
        memcpy((void *)vectors + 0x200, __stubs_start, __stubs_end - __stubs_start);
        memcpy((void *)vectors + 0x1000 - kuser_sz, __kuser_helper_start, kuser_sz);
  flush_icache_range(vectors, vectors + PAGE_SIZE); 
 Vector table and vector stub code for ARM resides in arch/arm/kernel/entry-armv.S file 
@ Exception vector table: one instruction per slot, in a fixed order.
@ Every slot except the SWI one is a branch (b) into the stubs.
__vectors_start:
 ARM(   swi     SYS_ERROR0      )       @ first slot (presumably the reset vector - confirm)
 THUMB( svc     #0              )
 THUMB( nop                     )
        W(b)    vector_und + stubs_offset       @ undefined instruction
        W(ldr)  pc, .LCvswi + stubs_offset      @ SWI: loads pc instead of branching
        W(b)    vector_pabt + stubs_offset      @ prefetch abort
        W(b)    vector_dabt + stubs_offset      @ data abort
        W(b)    vector_addrexcptn + stubs_offset        @ address exception
       
W(b)    vector_irq + stubs_offset       @ IRQ
        W(b)    vector_fiq + stubs_offset       @ FIQ

        .globl  __vectors_end
__vectors_end:

 

 As we can see, this vector table contains branch instructions for branching to the exception handler code, which also resides in the same file (entry-armv.S). 
The vector table contains the branch instructions for all the exceptions defined in ARM (Undefined instruction, SWI, data abort, prefetch abort, IRQ, and FIQ). The most important thing to note about this vector table is that branch instructions are used for all exceptions except SWI. Using a branch instruction instead of loading the PC directly with the exception handler's address makes this code position independent. Since a branch instruction takes an offset (positive or negative) from the current PC, this code will run fine as long as the offset between the vector table instructions and the exception handlers is maintained as desired by this code. It is assumed here that the exception handlers will be at a +0x200 offset from the starting address of the vector table. 
So we are done with setting up vector tables and the exception handlers. If you want then you can hook your exception handler directly to the vector table, so that you bypass all linux interrupt handling code, which is pretty heavy. But if you do so then you'll have to get your hands dirty with all the architecture details which kernel handles beautifully and cleanly.
After setting up the vector table, start_kernel( ) calls init_IRQ() to set up the kernel IRQ handling infrastructure. On ARM we have 32 hard interrupts, for which the kernel sets up a default descriptor called bad_irq_desc, which has do_bad_IRQ( ) as its IRQ handler. Then init_IRQ( ) calls init_arch_irq( ); here the architecture-specific code has to set up the IRQ handlers for the 32 IRQs. If they are not set, do_bad_IRQ( ) will handle the IRQs.


start_kernel()  init/main.c
        |
  init_IRQ()      arch/arm/kernel/irq.c
       |
mdesc->init_irq() 
      |
gic_init_irq() //arch/arm/mach-omap2/boardxx.c (part of the function pointers in the MACHINE_START macro) 
     |
gic_init(0,29,base_addr,cpu_base)  //arch/arm/common/gic.c
     |
gic_dist_init(gic, irq_start)
gic_cpu_init(gic)

gic_dist_init(gic,irq_start)
{
  for(i=irq_start; i < irq_limit; i++)  //All the interrupts and their handlers registered here
  {
    irq_set_chip_and_handler(i, &gic_chip, handle_fasteoi_irq);  //This is where handler is registered for each interrupt line
    irq_set_chip_data(i, gic);
    set_irq_flags(i, IRQF_VALID | IRQF_PROBE);
}

/* Abridged view of the interrupt descriptor; only a few of its fields are shown. */
struct irq_desc {
/* called by the architecture-specific code whenever an interrupt occurs */
irq_flow_handler_t handle_irq;  //flow handler
struct irq_data         irq_data;
/* first registered handler; further handlers are reached through action->next */
 struct irqaction        *action;
unsigned int            depth;
const char              *name;
}

handle_irq is called by the architecture-specific code whenever an interrupt occurs. The function is then responsible to use the controller-specific methods provided in chip to perform the necessary low-level actions required to process the interrupt.

handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
           |
 handle_irq_event(desc);
           |
 handle_irq_event_percpu(desc, action); 
  {
     do {
            res = action->handler(irq, action->dev_id); // your interrupt handler gets called
             action = action->next;
        } while (action);    
}

So now we have our IRQ infrastructure in place, and various modules can register their IRQ handlers through request_irq(). When you call request_irq( ), the kernel appends your IRQ handler to the list of IRQ handlers registered for that particular IRQ line; it does not change the exception vector table.

Now let's see what happens after an interrupt is received. 

2. Interrupt Handling
When an IRQ is raised, ARM stops what it is processing (assuming it is not processing an FIQ!), disables further IRQs (not FIQs), puts CPSR in SPSR, puts the current PC into LR and switches to IRQ mode, refers to the vector table and jumps to the exception handler. In our case it jumps to the exception handler of IRQ.  

When the interrupt occurs, execution jumps to the vector table entry "b vector_irq + stubs_offset". Note that at this point the vector table starts at address 0xffff0000.  
    

From there it jumps to the IRQ stub at the start of the stubs section:
  .globl  __stubs_start
__stubs_start:

 vector_stub     irq, IRQ_MODE, 4
.long   __irq_usr                       @  0  (USR_26 / USR_32)
.long   __irq_svc                       @  3  (SVC_26 / SVC_32)
......................
vector_stub macro is defined as
/* Vector stubs.
 * This code is copied to 0xffff0200 so we can use branches in the
 * vectors, rather than ldr's.  Note that this code must not
 * exceed 0x300 bytes.
 *
 * Common stub entry macro:
 * Enter in IRQ mode, spsr = SVC/USR CPSR, lr = SVC/USR PC
 */

.macro  vector_stub, name, mode, correction=0

vector_\name:
       @
        @ Prepare for SVC32 mode.  IRQs remain disabled.
        @
        mrs     r0, cpsr
        eor     r0, r0, #(\mode ^ SVC_MODE | PSR_ISETSTATE)
        msr     spsr_cxsf, r0
 
 ARM(   ldr     lr, [pc, lr, lsl #2]    )
        movs    pc, lr                  @ branch to handler in SVC mode
ENDPROC(vector_\name)

Substituting "irq, IRQ_MODE, 4" for the vector_stub macro parameters "name, mode, correction" yields our interrupt entry point vector_irq (the label vector_\name inside the macro).

In the next step the program will jump to __irq_usr, __irq_svc or one of the other locations.

@ IRQ path listed for SVC mode in the vector_stub irq table: runs the
@ svc_entry macro, bumps the preempt count when CONFIG_PREEMPT is set,
@ runs the irq_handler macro and finally returns through svc_exit.
__irq_svc:
        svc_entry
#ifdef CONFIG_PREEMPT
        get_thread_info tsk
        ldr     r8, [tsk, #TI_PREEMPT]          @ get preempt count
        add     r7, r8, #1                                @ increment it
        str     r7, [tsk, #TI_PREEMPT]          @ store the incremented count back
#endif
       irq_handler
    svc_exit r4                             @ return from exception
 UNWIND(.fnend          )
ENDPROC(__irq_svc)

svc_entry macro:
/*
 * SVC mode handlers
 */
      @ Entry macro used by the SVC mode handlers (abridged excerpt).
      @ Loads r1 - r3 from the address in r0, saves the "real" r0 on the
      @ stack and finally stores r0 - r4 at the address computed in r5.
      .macro  svc_entry, stack_hole=0
       ldmia   r0, {r1 - r3}
        add     r5, sp, #S_SP - 4       @ here for interlock avoidance
        mov     r4, #-1                 @  ""  ""      ""       ""
        add     r0, sp, #(S_FRAME_SIZE + \stack_hole - 4)
        SPFIX( addeq   r0, r0, #4      )
        str     r1, [sp, #-4]!          @ save the "real" r0 copied
                                             @ from the exception stack

        mov     r1, lr
       stmia   r5, {r0 - r4}
        .endm

The implementation of irq_handler:
/*Interrupt handling.  Preserves r7, r8, r9 */
        .macro  irq_handler
          arch_irq_handler_default
                |
.macro  arch_irq_handler_default    //arch/arm/include/asm/entry-macro-multi.S
        get_irqnr_preamble r5, lr
1:      get_irqnr_and_base r0, r6, r5, lr
        movne   r1, sp
        @
        @ routine called with r0 = irq number, r1 = struct pt_regs *
        @
        adrne   lr, BSYM(1b)
        bne     asm_do_IRQ (jump to the kernel)

------------>---------->-------->------------>

          .macro  get_irqnr_preamble, base, tmp  // to get the IRQ base address
            ldr     \base, =OMAPX_IRQ_BASE

  /*
   * Read the four "IRQ pending" registers at offsets 0x98, 0xb8, 0xd8 and
   * 0xf8 from \base in turn; as soon as one of them is non-zero, skip to
   * label 9999.  The condition flags of the last cmp are still set on exit,
   * which the movne/bne in arch_irq_handler_default then test.
   */
  .macro  get_irqnr_and_base, irqnr, irqstat, base, tmp   //finds the interrupt number
                ldr     \irqnr, [\base, #0x98] /* IRQ pending reg 1 */
                cmp     \irqnr, #0x0
                bne     9999f                  /* non-zero: stop scanning */
                ldr     \irqnr, [\base, #0xb8] /* IRQ pending reg 2 */
                cmp     \irqnr, #0x0
                bne     9999f                  /* non-zero: stop scanning */
                ldr     \irqnr, [\base, #0xd8] /* IRQ pending reg 3 */
                cmp     \irqnr, #0x0
                bne     9999f                  /* non-zero: stop scanning */
                ldr     \irqnr, [\base, #0xf8] /* IRQ pending reg 4 */
                cmp     \irqnr, #0x0
9999:
                /* only if the last compare was non-zero: read the register at INTCPS_SIR_IRQ_OFFSET */
                ldrne   \irqnr, [\base, #INTCPS_SIR_IRQ_OFFSET]
                and     \irqnr, \irqnr, #ACTIVEIRQ_MASK /* Clear spurious bits */


                .endm

The last step: now that you have the interrupt number in a register, jump to the kernel:
 bne     asm_do_IRQ (jump to the kernel)


Arch specific things are done, now we will move to ARCH Independent things.
 asm_do_IRQ(unsigned int irq, struct pt_regs *regs)
             |
 generic_handle_irq(irq);  //arch/arm/kernel/irq.c
            |
generic_handle_irq_desc(irq, desc);  //kernel/irq/irqdesc.c 
           |
desc->handle_irq(irq,desc) //This is the actual call for the flow handler which we registered as handle_fasteoi_irq
           |
           
handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
           |
 handle_irq_event(desc);
           |
 handle_irq_event_percpu(desc, action); 
  {
     do {
            res = action->handler(irq, action->dev_id); // your interrupt handler gets called
             action = action->next;
        } while (action);    
}