My final Python project. Looking for general feedback


#1

For my final project i've written some code to collect tv-guide data from a website, sort it chronologically and match the information with a list of terms/programs I am interested in.

I thought I'd ask for some general feedback because there's probably lot's to improve. I don't expect anyone to check all of the code, but there are probably some things that stand out that could easily be improved.

First file (fetch_text.py, this is the part that uses scrapy):

from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess
from scrapy import optional_features
optional_features.remove('boto')

#setting the filename of the text file
filename = "fetched_text"
filename2 = "fetched_channels"
#creating a clean text files (or clearing the old ones)
with open(filename, 'wb') as f:
    f.write("")
with open(filename2, 'wb') as f:
    f.write("")

class TVSpider(CrawlSpider):
    name = "tv"
    allowed_domains = ["http://www.tvgids.tv", "www.tvgids.tv"]
    start_urls = ["http://www.tvgids.tv/primetime"]

    def parse(self, response):
        #getting text (program info) from page, then cleaning it up and changing encoding
        messy_content = (response.xpath('//div[@class="section"]//text()').extract())
        cleaner_content = "".join(messy_content)
        clean_content = cleaner_content.encode("utf8", "replace")
        
        #removing all tabs
        clean_content = clean_content.replace("\t", "")
        
        with open(filename, 'ab') as f:
            f.write(clean_content)
        
        #getting text from page (channel info), then cleaning it up and changing encoding
        messy_content2 = (response.xpath('//div[@class="section-title"]//text()').extract())
        cleaner_content2 = "".join(messy_content2)
        clean_content2 = cleaner_content2.encode("utf8", "ignore")
        
        #removing all tabs
        clean_content2 = clean_content2.replace("\t", "")
        
        with open(filename2, 'ab') as f:
            f.write(clean_content2)


#Starting the Crawler
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

process.crawl(TVSpider)
process.start() # the script will block here until the crawling is finished

Second file (run.py):

#This program allows a user to display tv-guide information and match it with a "hitlist"
#The program information is gathered using scrapy

print ""
print "initialising, please wait...."

import re
import fetch_text #Importing this file executes it. It needs to be executed, but i don know why it happens automatically.

#This function makes a list out of the channels that were stored in a file by fetch_text
def get_channels(filename):
    clean_channel_list = []
    with open(filename, 'r') as f:
        channel_list = f.readlines()
    #cleaning the list (removing \n and the first line)
    for item in channel_list:
        if item != "\n":
            item2 = item.replace("\n", "")
            clean_channel_list.append(item2)
    clean_channel_list = clean_channel_list[1::]
    return clean_channel_list

#This function makes a list out of the program info that was stored in a file by fetch_text
def get_clean_list(filename):
    clean_fetched_text = []
    with open(filename, 'r') as f:
        fetched_text = f.readlines()
    #cleaning the list (removing \n and the first few lines)
    for item in fetched_text:
        if item != "\n":
            item2 = item.replace("\n", "")
            clean_fetched_text.append(item2)
    clean_fetched_text = clean_fetched_text[13::]
    return clean_fetched_text

#This function checks the program info for any channel names and makes a list of lists filled with program info based on channel (one list per channel)
def make_list(lst, programs):
    remove_list = ["Disney Channel", "13th Street", "Eurosport 1", "SBS 9"] #Part of a small hack to exclude certain channels.
    temp = [[]]
    overzicht = []
    position = -1
    for item in lst:
        for program in programs:
            if item == program:
                position += 1
                temp.append([])
        if position > -1:
            temp[position].append(item)
    for item_index in range(len(temp)-1):
        x = 0
        for item2 in remove_list: #Part of a small hack to exclude certain channels.
            if temp[item_index][0] == item2:
                x = 1
        if x == 0:
            overzicht.append(temp[item_index])

    return overzicht

#This function makes a list of all program information sorted bij time. It uses different formatting for entries that match the hitlist.
def time_line(lst):
    unsorted_lst = []
    for item1 in lst:
        for item2_index in range(len(item1)-1):
            if re.match( "..:..", item1[item2_index]):
                strng = item1[item2_index]+" "+item1[0][0:5]+":\t\t"+item1[item2_index+1].lower()
                for x in hit_lst:
                    if strng.find(x) != -1:
                        strng = item1[item2_index]+" "+item1[0][0:5]+":\t"+"HIT"+"\t"+item1[item2_index+1].upper()
                unsorted_lst.append(strng)
    sorted_lst = sorted(unsorted_lst)
    return sorted_lst

#This function shows a list of programs sorted by time that match the hitlist.
def hit_line(lst):
    unsorted_lst = []
    for item1 in lst:
        for item2_index in range(len(item1)-1):
            if re.match( "..:..", item1[item2_index]):
                strng = item1[item2_index]+" "+item1[0][0:5]+":\t\t"+item1[item2_index+1].lower()
                for x in hit_lst:
                    if strng.find(x) != -1:
                        unsorted_lst.append(strng)
    sorted_lst = sorted(unsorted_lst)
    return sorted_lst

#This function imports the hitlist from a file.
def hit_lst_import():
    with open(hit_list_filename, 'r') as f:
        hit_lst_import = f.readlines()
    hit_lst = []
    #cleaning up the hitlist
    for item in hit_lst_import:
        item2 = item.replace("\n", "")
        hit_lst.append(item2)
    return  hit_lst

#This function lets you view and edit the hitlist.
def edit_hit_lst(hit_lst):
    choice = "0"
    while choice != "3":
        print "\nThis is your hitlist:\n"
        for item in hit_lst:
            print item
        choice = raw_input("""\n\n
        What would you like to do? Please enter your choice.\n
        1 = Add item to hitlist
        2 = Delete item from hitlist
        3 = Exit to previous menu\n\n
        Your choice: """)
        if choice == "1":
            add_item = raw_input("please input (a part of) a name of a program to add. Please be precise and use lowercase only:\n")
            hit_lst.append(add_item)
            with open(hit_list_filename, "wb") as f:
                f.write("") #Just cleaning the file before i write to it with "a". There must be a better way to do this.
            for item in hit_lst:
                with open(hit_list_filename, "ab") as f:
                    f.write(item+"\n")
        elif choice == "2":
            del_item = raw_input("please input the entry to be deleted. Please be precise and use lowercase only:\n")
            hit_lst.remove(del_item)
            with open(hit_list_filename, "wb") as f:
                f.write("")
            for item in hit_lst:
                with open(hit_list_filename, "ab") as f:
                    f.write(item+"\n")
        elif choice == "3":
            break
        else:
            print "\n\tPlease input a number"

#Main program

hit_list_filename = "hitlist"
clean_list = get_clean_list("fetched_text")
channels = get_channels("fetched_channels")
made_list = make_list(clean_list, channels)
hit_lst = hit_lst_import()

print "\n"*20
print "\tWelcome to your TV Guide for tonight!\n"

choice = "0"
while choice != "4":
    choice = raw_input("""\n
    What would you like to do? Please enter your choice.\n
    1 = Check all programs for tonight
    2 = Check if any programs match your hitlist
    3 = Check your hitlist
    4 = Exit\n\n
    Your choice: """)
    if choice == "1":
        for item in time_line(made_list):
            print item
    elif choice == "2":
        print ""
        for item in hit_line(made_list):
            print item
    elif choice == "3":
        edit_hit_lst(hit_lst)
    elif choice == "4":
        "\n\t\tThank you and goodbye!"
        break #shouldn't need this, but just in case.
    else:
        print "\n\tPlease input a number"


#2

I'll admit I haven't thought about it very hard, but this looks like it could be done in one line with list comprehension and/or some built-in functions. Might possibly want a different data structure, all those empty lists really look weird.

temp = [[]]
position = -1
for item in lst:
    for program in programs:
        if item == program:
            position += 1
            temp.append([])
    if position > -1:
        temp[position].append(item)

---

#Just cleaning the file before i write to it with "a". There must be a better way to do this.

Yeah, you can do this while opening the file: https://docs.python.org/3/library/functions.html#open
That's a link to Python3's docs but it has a nicer table of those modes.. You can change the 3 in the url to 2 if you prefer.

---

x = 0
for item2 in remove_list: #Part of a small hack to exclude certain channels.
    if temp[item_index][0] == item2:
        x = 1
if x == 0:
    overzicht.append(temp[item_index])

Can be written as:

for item2 in remove_list: #Part of a small hack to exclude certain channels.
    if temp[item_index][0] == item2:
        break
else:
    overzicht.append(temp[item_index])

But really it should be:

if temp[item_index][0] not in remove_list:
    overzicht.append(temp[item_index])

---

temp item_index item2
bad names, name them after what they represent.

---

    clean_fetched_text = []
    with open(filename, 'r') as f:
        fetched_text = f.readlines()
    #cleaning the list (removing \n and the first few lines)
    for item in fetched_text:
        if item != "\n":
            item2 = item.replace("\n", "")
            clean_fetched_text.append(item2)

Should be more like this:

with open(filename) as f:
    fetched_text = [line for line in f.read().splitlines() if line]

And since you're doing this in more than once place you'll want to create a function that gets the non-empty lines of a file.

---

clean_fetched_text = clean_fetched_text[13::]
Don't know what you're removing or keeping, but it would be nicer if your code examined the lines and removed until the data you wanted is found.

---

if strng.find(x) != -1:

if x not in strng

(also terrible names)

---

I'd want to avoid dealing with strings when finding matches between two lists and such. Best to clean up the data once and then never deal with strings again, convert them to objects that can easily be sorted by their attributes and also make it easy to add more functionality like testing if two programmes overlap. These objects could then be added to some generic data structure to allow more efficient lookups/searches.


#3

Thank you so much for taking the time to give feedback.
It was very helpful.

I didn't know (or had forgotten about) "if X (not) in Y", which seems extremely usefull.
And i'm definitely going to spend some more time trying to get the hang of list comprehension. It looks so darn elegant... [line for line in f.read().splitlines() if line] was exactly what I was looking for but could not have figured out on my own.